Upload folder using huggingface_hub
- modules/SD15/SDToken.py  +30 -7
modules/SD15/SDToken.py
CHANGED
@@ -236,32 +236,49 @@ class SDTokenizer:
         - `pad_to_max_length` (bool, optional): Whether to pad to the maximum length. Defaults to True.
         - `min_length` (int, optional): The minimum length of the input. Defaults to None.
         """
+        # Ensure tokenizer path exists
         if tokenizer_path is None:
-            tokenizer_path = "
-        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
+            tokenizer_path = os.path.join(os.path.dirname(__file__), "../_internal/sd1_tokenizer")
+
+        # Verify path exists
+        if not os.path.exists(tokenizer_path):
+            raise ValueError(f"Tokenizer path does not exist: {tokenizer_path}")
+
+        try:
+            self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load tokenizer from {tokenizer_path}: {str(e)}")
+
         self.max_length = max_length
         self.min_length = min_length
-
+
+        # Get tokens from empty string tokenization
         empty = self.tokenizer("")["input_ids"]
+
         if has_start_token:
             self.tokens_start = 1
             self.start_token = empty[0]
             self.end_token = empty[1]
         else:
             self.tokens_start = 0
-            self.start_token = None
+            self.start_token = None
             self.end_token = empty[0]
+
         self.pad_with_end = pad_with_end
         self.pad_to_max_length = pad_to_max_length

+        # Create vocab lookup
         vocab = self.tokenizer.get_vocab()
         self.inv_vocab = {v: k for k, v in vocab.items()}
+
+        # Set embedding properties
         self.embedding_directory = embedding_directory
         self.max_word_length = 8
         self.embedding_identifier = "embedding:"
         self.embedding_size = embedding_size
         self.embedding_key = embedding_key

+
     def _try_get_embedding(self, embedding_name: str) -> tuple:
         """#### Try to get an embedding.
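To make the new constructor logic concrete, here is a small standalone sketch of the two techniques this hunk adds: guarded tokenizer loading and the empty-string probe for the special tokens. It assumes a CLIP-style tokenizer from the transformers library; the hub id below is a stand-in for the repo's local _internal/sd1_tokenizer folder, not what SDToken.py actually loads.

from transformers import CLIPTokenizer

# Stand-in path: any CLIP-compatible tokenizer works for this sketch.
tokenizer_path = "openai/clip-vit-base-patch32"

try:
    tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path)
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer from {tokenizer_path}: {e}")

# Tokenizing the empty string leaves only the special tokens, which is
# how the constructor discovers the start/end token ids.
empty = tokenizer("")["input_ids"]
start_token, end_token = empty[0], empty[1]
print(start_token, end_token)  # 49406 49407 for the standard CLIP vocab

# The inverted vocab maps ids back to token strings, as in SDTokenizer.
inv_vocab = {v: k for k, v in tokenizer.get_vocab().items()}
print(inv_vocab[start_token], inv_vocab[end_token])  # <|startoftext|> <|endoftext|>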
@@ -410,7 +427,7 @@ class SDTokenizer:
 class SD1Tokenizer:
     """#### Class representing the SD1Tokenizer."""

-    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer):
+    def __init__(self, embedding_directory: str = None, clip_name: str = "l", tokenizer: type = SDTokenizer, tokenizer_data: dict = None):
         """#### Initialize the SD1Tokenizer.

         #### Args:
@@ -419,8 +436,14 @@ class SD1Tokenizer:
         - `tokenizer` (type, optional): The tokenizer class. Defaults to SDTokenizer.
         """
         self.clip_name = clip_name
-        self.clip = "clip_{}".format(self.clip_name)
-        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))
+        self.clip = f"clip_{self.clip_name}"
+
+        # Initialize tokenizer with proper arguments
+        kwargs = {"embedding_directory": embedding_directory}
+        if tokenizer_data:
+            kwargs.update(tokenizer_data)
+
+        setattr(self, self.clip, tokenizer(**kwargs))

     def tokenize_with_weights(self, text: str, return_word_ids: bool = False) -> dict:
         """#### Tokenize text with weights.
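The last two hunks work together: the new tokenizer_data dict flows into the inner tokenizer's keyword arguments, and the resulting instance is stored under a computed attribute name ("clip_l" by default). Below is a minimal sketch of that wiring, with a hypothetical DummyTokenizer standing in for SDTokenizer.

# Minimal sketch; DummyTokenizer is a stand-in, not the real SDTokenizer.
class DummyTokenizer:
    def __init__(self, embedding_directory=None, max_length=77):
        self.embedding_directory = embedding_directory
        self.max_length = max_length

class SD1TokenizerSketch:
    def __init__(self, embedding_directory=None, clip_name="l",
                 tokenizer=DummyTokenizer, tokenizer_data=None):
        self.clip_name = clip_name
        self.clip = f"clip_{self.clip_name}"  # attribute name, e.g. "clip_l"

        # Fold optional per-model overrides into the constructor kwargs.
        kwargs = {"embedding_directory": embedding_directory}
        if tokenizer_data:
            kwargs.update(tokenizer_data)

        # Store the inner tokenizer under the computed attribute name.
        setattr(self, self.clip, tokenizer(**kwargs))

tok = SD1TokenizerSketch(embedding_directory="embeddings/",
                         tokenizer_data={"max_length": 77})
inner = getattr(tok, tok.clip)  # same object as tok.clip_l
print(tok.clip, inner.embedding_directory, inner.max_length)  # clip_l embeddings/ 77

Using setattr with a computed name keeps __init__ reusable: a subclass can pass a different clip_name (e.g. "g") and the instance lands under the matching attribute without any further code changes.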