Aatricks committed
Commit fc1f404 · verified · Parent(s): 2f972a1

Upload folder using huggingface_hub

Files changed (1)
  1. modules/SD15/SDToken.py +30 -7
modules/SD15/SDToken.py CHANGED
@@ -236,32 +236,49 @@ class SDTokenizer:
         - `pad_to_max_length` (bool, optional): Whether to pad to the maximum length. Defaults to True.
         - `min_length` (int, optional): The minimum length of the input. Defaults to None.
         """
+        # Ensure tokenizer path exists
         if tokenizer_path is None:
-            tokenizer_path = "./_internal/sd1_tokenizer/"
-        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
+            tokenizer_path = os.path.join(os.path.dirname(__file__), "../_internal/sd1_tokenizer")
+
+        # Verify path exists
+        if not os.path.exists(tokenizer_path):
+            raise ValueError(f"Tokenizer path does not exist: {tokenizer_path}")
+
+        try:
+            self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load tokenizer from {tokenizer_path}: {str(e)}")
+
         self.max_length = max_length
         self.min_length = min_length
-
+
+        # Get tokens from empty string tokenization
         empty = self.tokenizer("")["input_ids"]
+
         if has_start_token:
             self.tokens_start = 1
             self.start_token = empty[0]
             self.end_token = empty[1]
         else:
             self.tokens_start = 0
-            self.start_token = None
+            self.start_token = None
             self.end_token = empty[0]
+
         self.pad_with_end = pad_with_end
         self.pad_to_max_length = pad_to_max_length
 
+        # Create vocab lookup
         vocab = self.tokenizer.get_vocab()
         self.inv_vocab = {v: k for k, v in vocab.items()}
+
+        # Set embedding properties
         self.embedding_directory = embedding_directory
         self.max_word_length = 8
         self.embedding_identifier = "embedding:"
         self.embedding_size = embedding_size
         self.embedding_key = embedding_key
 
+
     def _try_get_embedding(self, embedding_name: str) -> tuple:
         """#### Try to get an embedding.
 
@@ -410,7 +427,7 @@ class SDTokenizer:
 class SD1Tokenizer:
     """#### Class representing the SD1Tokenizer."""
 
-    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer):
+    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer, tokenizer_data: dict = None):
         """#### Initialize the SD1Tokenizer.
 
         #### Args:
@@ -419,8 +436,14 @@ class SD1Tokenizer:
         - `tokenizer` (type, optional): The tokenizer class. Defaults to SDTokenizer.
         """
         self.clip_name = clip_name
-        self.clip = "clip_{}".format(self.clip_name)
-        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))
+        self.clip = f"clip_{self.clip_name}"
+
+        # Initialize tokenizer with proper arguments
+        kwargs = {"embedding_directory": embedding_directory}
+        if tokenizer_data:
+            kwargs.update(tokenizer_data)
+
+        setattr(self, self.clip, tokenizer(**kwargs))
 
     def tokenize_with_weights(self, text: str, return_word_ids: bool = False) -> dict:
         """#### Tokenize text with weights.