Aatricks committed
Commit 7c12c07 · verified · 1 Parent(s): bb1967b

Upload folder using huggingface_hub

Files changed (1):
  1. modules/SD15/SDToken.py +7 -35
modules/SD15/SDToken.py CHANGED
@@ -236,54 +236,32 @@ class SDTokenizer:
         - `pad_to_max_length` (bool, optional): Whether to pad to the maximum length. Defaults to True.
         - `min_length` (int, optional): The minimum length of the input. Defaults to None.
         """
-        # Ensure tokenizer path exists
         if tokenizer_path is None:
-            tokenizer_path = "./_internal/sd1_tokenizer/"
-
-        # Verify path exists
-        if not os.path.exists(tokenizer_path):
-            raise ValueError(f"Tokenizer path does not exist: {tokenizer_path}")
-
-        try:
-            if tokenizer_path is None:
-                # Use pre-bundled tokenizer
-                self.tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
-            else:
-                # Try local tokenizer files
-                self.tokenizer = CLIPTokenizerFast.from_pretrained(tokenizer_path)
-        except Exception as e:
-            raise RuntimeError(f"Failed to load tokenizer from {tokenizer_path}: {str(e)}")
-
+            tokenizer_path = os.path.join("_internal/sd1_tokenizer/", "")
+        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
         self.max_length = max_length
         self.min_length = min_length
-
-        # Get tokens from empty string tokenization
+
         empty = self.tokenizer("")["input_ids"]
-
         if has_start_token:
             self.tokens_start = 1
             self.start_token = empty[0]
             self.end_token = empty[1]
         else:
             self.tokens_start = 0
-            self.start_token = None
+            self.start_token = None
             self.end_token = empty[0]
-
         self.pad_with_end = pad_with_end
         self.pad_to_max_length = pad_to_max_length
 
-        # Create vocab lookup
         vocab = self.tokenizer.get_vocab()
         self.inv_vocab = {v: k for k, v in vocab.items()}
-
-        # Set embedding properties
         self.embedding_directory = embedding_directory
         self.max_word_length = 8
         self.embedding_identifier = "embedding:"
         self.embedding_size = embedding_size
         self.embedding_key = embedding_key
 
-
     def _try_get_embedding(self, embedding_name: str) -> tuple:
         """#### Try to get an embedding.
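For reference, a minimal sketch of the loading behavior this hunk settles on, assuming `tokenizer_class` resolves to `transformers.CLIPTokenizerFast` (the class the removed branch called; an assumption, since the hunk only shows the call site). The local default path and the hub checkpoint id are both taken from the diff above:

import os
from transformers import CLIPTokenizerFast

local_path = "_internal/sd1_tokenizer/"  # default from the hunk above
if os.path.isdir(local_path):
    tokenizer = CLIPTokenizerFast.from_pretrained(local_path)
else:
    # Fall back to the hub checkpoint the removed branch referenced.
    tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")

# Tokenizing "" exposes the special tokens: for the standard CLIP vocab
# this yields [49406, 49407], the start/end ids that __init__ stores as
# self.start_token and self.end_token.
empty = tokenizer("")["input_ids"]
start_token, end_token = empty[0], empty[1]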
@@ -432,7 +410,7 @@ class SDTokenizer:
 class SD1Tokenizer:
     """#### Class representing the SD1Tokenizer."""
 
-    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer, tokenizer_data: dict = None):
+    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer):
         """#### Initialize the SD1Tokenizer.
 
         #### Args:
@@ -441,14 +419,8 @@ class SD1Tokenizer:
         - `tokenizer` (type, optional): The tokenizer class. Defaults to SDTokenizer.
         """
         self.clip_name = clip_name
-        self.clip = f"clip_{self.clip_name}"
-
-        # Initialize tokenizer with proper arguments
-        kwargs = {"embedding_directory": embedding_directory}
-        if tokenizer_data:
-            kwargs.update(tokenizer_data)
-
-        setattr(self, self.clip, tokenizer(**kwargs))
+        self.clip = "clip_{}".format(self.clip_name)
+        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))
 
     def tokenize_with_weights(self, text: str, return_word_ids: bool = False) -> dict:
         """#### Tokenize text with weights.
 