Upload folder using huggingface_hub
modules/SD15/SDToken.py CHANGED (+7 -35)
@@ -236,54 +236,32 @@ class SDTokenizer:
         - `pad_to_max_length` (bool, optional): Whether to pad to the maximum length. Defaults to True.
         - `min_length` (int, optional): The minimum length of the input. Defaults to None.
         """
-        # Ensure tokenizer path exists
         if tokenizer_path is None:
-            tokenizer_path = "
-
-        # Verify path exists
-        if not os.path.exists(tokenizer_path):
-            raise ValueError(f"Tokenizer path does not exist: {tokenizer_path}")
-
-        try:
-            if tokenizer_path is None:
-                # Use pre-bundled tokenizer
-                self.tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
-            else:
-                # Try local tokenizer files
-                self.tokenizer = CLIPTokenizerFast.from_pretrained(tokenizer_path)
-        except Exception as e:
-            raise RuntimeError(f"Failed to load tokenizer from {tokenizer_path}: {str(e)}")
-
+            tokenizer_path = os.path.join("_internal/sd1_tokenizer/", "")
+        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
         self.max_length = max_length
         self.min_length = min_length
-
-        # Get tokens from empty string tokenization
+
         empty = self.tokenizer("")["input_ids"]
-
         if has_start_token:
             self.tokens_start = 1
             self.start_token = empty[0]
             self.end_token = empty[1]
         else:
             self.tokens_start = 0
-            self.start_token = None
+            self.start_token = None
             self.end_token = empty[0]
-
         self.pad_with_end = pad_with_end
         self.pad_to_max_length = pad_to_max_length
 
-        # Create vocab lookup
         vocab = self.tokenizer.get_vocab()
         self.inv_vocab = {v: k for k, v in vocab.items()}
-
-        # Set embedding properties
         self.embedding_directory = embedding_directory
         self.max_word_length = 8
         self.embedding_identifier = "embedding:"
         self.embedding_size = embedding_size
         self.embedding_key = embedding_key
 
-
     def _try_get_embedding(self, embedding_name: str) -> tuple:
         """#### Try to get an embedding.
 
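Note: after this change the loader resolves to the bundled `_internal/sd1_tokenizer/` folder when no path is given, and it derives the special start/end token IDs by encoding an empty string. A minimal sketch of both steps, assuming a CLIP-style fast tokenizer (`CLIPTokenizerFast` is taken from the removed code; the hub fallback here is illustrative, not part of this commit):

    import os
    from transformers import CLIPTokenizerFast

    # Resolve the tokenizer source: the bundled folder if it exists,
    # otherwise the public CLIP checkpoint (illustrative fallback only).
    path = "_internal/sd1_tokenizer/"
    if os.path.isdir(path):
        tok = CLIPTokenizerFast.from_pretrained(path)
    else:
        tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")

    # Encoding "" yields only the special markers: [start_token, end_token].
    empty = tok("")["input_ids"]
    start_token, end_token = empty[0], empty[1]
    print(start_token, end_token)  # 49406 49407 for CLIP's BPE vocab

For a tokenizer without a start token (`has_start_token=False`), `empty` holds a single ID, which is why the constructor falls back to `empty[0]` as the end token in that branch.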
@@ -432,7 +410,7 @@ class SD1Tokenizer:
 class SD1Tokenizer:
     """#### Class representing the SD1Tokenizer."""
 
-    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer
+    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer):
         """#### Initialize the SD1Tokenizer.
 
         #### Args:
@@ -441,14 +419,8 @@ class SD1Tokenizer:
         - `tokenizer` (type, optional): The tokenizer class. Defaults to SDTokenizer.
         """
         self.clip_name = clip_name
-        self.clip = 
-
-        # Initialize tokenizer with proper arguments
-        kwargs = {"embedding_directory": embedding_directory}
-        if tokenizer_data:
-            kwargs.update(tokenizer_data)
-
-        setattr(self, self.clip, tokenizer(**kwargs))
+        self.clip = "clip_{}".format(self.clip_name)
+        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))
 
     def tokenize_with_weights(self, text: str, return_word_ids: bool = False) -> dict:
         """#### Tokenize text with weights.
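Note: `SD1Tokenizer` now attaches its inner tokenizer under a dynamic attribute name, `"clip_" + clip_name` (so `clip_l` by default), and later code reaches it back through `getattr`. A minimal sketch of that pattern with a stub standing in for `SDTokenizer` (the stub and the return shape of `tokenize_with_weights` are assumptions for illustration):

    class StubTokenizer:
        # Stand-in for SDTokenizer; only the delegation pattern matters here.
        def __init__(self, embedding_directory=None):
            self.embedding_directory = embedding_directory

        def tokenize_with_weights(self, text, return_word_ids=False):
            return [(tok, 1.0) for tok in text.split()]

    class SD1Tokenizer:
        def __init__(self, embedding_directory=None, clip_name="l", tokenizer=StubTokenizer):
            self.clip_name = clip_name
            self.clip = "clip_{}".format(self.clip_name)  # e.g. "clip_l"
            setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))

        def tokenize_with_weights(self, text, return_word_ids=False):
            # Look the inner tokenizer back up by its dynamic attribute name.
            inner = getattr(self, self.clip)
            return {self.clip_name: inner.tokenize_with_weights(text, return_word_ids)}

    t = SD1Tokenizer()
    print(t.tokenize_with_weights("a photo of a cat"))

Keying the result by `clip_name` keeps the same call site working for models that carry more than one text encoder (e.g. a `"g"` tokenizer alongside `"l"`).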