caesar-one committed on
Commit
9d3ebbc
·
verified ·
1 Parent(s): f622be2

Upload ConstBERT

Browse files
Files changed (2) hide show
  1. colbert_configuration.py +3 -0
  2. modeling.py +12 -9
colbert_configuration.py CHANGED
@@ -158,6 +158,7 @@ class ResourceSettings:
158
  collection: str = DefaultVal(None)
159
  queries: str = DefaultVal(None)
160
  index_name: str = DefaultVal(None)
 
161
 
162
 
163
  @dataclass
@@ -350,6 +351,7 @@ class BaseConfig(CoreConfig):
350
 
351
  return config
352
 
 
353
  try:
354
  checkpoint_path = hf_hub_download(
355
  repo_id=checkpoint_path, filename="artifact.metadata"
@@ -360,6 +362,7 @@ class BaseConfig(CoreConfig):
360
  if os.path.exists(loaded_config_path):
361
  loaded_config, _ = cls.from_path(loaded_config_path)
362
  loaded_config.set("checkpoint", checkpoint_path)
 
363
 
364
  return loaded_config
365
 
 
158
  collection: str = DefaultVal(None)
159
  queries: str = DefaultVal(None)
160
  index_name: str = DefaultVal(None)
161
+ name_or_path: str = DefaultVal(None)
162
 
163
 
164
  @dataclass
 
351
 
352
  return config
353
 
354
+ name_or_path = checkpoint_path
355
  try:
356
  checkpoint_path = hf_hub_download(
357
  repo_id=checkpoint_path, filename="artifact.metadata"
 
362
  if os.path.exists(loaded_config_path):
363
  loaded_config, _ = cls.from_path(loaded_config_path)
364
  loaded_config.set("checkpoint", checkpoint_path)
365
+ loaded_config.set("name_or_path", name_or_path)
366
 
367
  return loaded_config
368
 
modeling.py CHANGED
@@ -1,18 +1,11 @@
1
  import torch.nn as nn
2
  from transformers import BertPreTrainedModel, BertModel, AutoTokenizer
 
3
  import torch
4
  from tqdm import tqdm
5
- from transformers import AutoTokenizer
6
  from .colbert_configuration import ColBERTConfig
7
  from .tokenization_utils import QueryTokenizer, DocTokenizer
8
-
9
- # this is a hack to force huggingface hub to download the tokenizer files
10
- try:
11
- with open("./tokenizer_config.json", "r") as f, open("./tokenizer.json", "r") as f2, open("./vocab.txt", "r") as f3:
12
- pass
13
- except Exception as e:
14
- pass
15
-
16
  class NullContextManager(object):
17
  def __init__(self, dummy_resource=None):
18
  self.dummy_resource = dummy_resource
@@ -70,6 +63,16 @@ class ConstBERT(BertPreTrainedModel):
70
  self.doc_project = nn.Linear(colbert_config.doc_maxlen, 32, bias=False)
71
  self.query_project = nn.Linear(colbert_config.query_maxlen, 64, bias=False)
72
 
 
 
 
 
 
 
 
 
 
 
73
  self.query_tokenizer = QueryTokenizer(colbert_config, verbose=verbose)
74
  self.doc_tokenizer = DocTokenizer(colbert_config)
75
  self.amp_manager = MixedPrecisionManager(True)
 
1
  import torch.nn as nn
2
  from transformers import BertPreTrainedModel, BertModel, AutoTokenizer
3
+ from huggingface_hub import hf_hub_download
4
  import torch
5
  from tqdm import tqdm
 
6
  from .colbert_configuration import ColBERTConfig
7
  from .tokenization_utils import QueryTokenizer, DocTokenizer
8
+ import os
 
 
 
 
 
 
 
9
  class NullContextManager(object):
10
  def __init__(self, dummy_resource=None):
11
  self.dummy_resource = dummy_resource
 
63
  self.doc_project = nn.Linear(colbert_config.doc_maxlen, 32, bias=False)
64
  self.query_project = nn.Linear(colbert_config.query_maxlen, 64, bias=False)
65
 
66
+ ## Download required tokenizer files from Hugging Face
67
+ if not os.path.exists(os.path.join(colbert_config.name_or_path, "tokenizer.json")):
68
+ hf_hub_download(repo_id=colbert_config.name_or_path, filename="tokenizer.json")
69
+ if not os.path.exists(os.path.join(colbert_config.name_or_path, "vocab.txt")):
70
+ hf_hub_download(repo_id=colbert_config.name_or_path, filename="vocab.txt")
71
+ if not os.path.exists(os.path.join(colbert_config.name_or_path, "tokenizer_config.json")):
72
+ hf_hub_download(repo_id=colbert_config.name_or_path, filename="tokenizer_config.json")
73
+ if not os.path.exists(os.path.join(colbert_config.name_or_path, "special_tokens_map.json")):
74
+ hf_hub_download(repo_id=colbert_config.name_or_path, filename="special_tokens_map.json")
75
+
76
  self.query_tokenizer = QueryTokenizer(colbert_config, verbose=verbose)
77
  self.doc_tokenizer = DocTokenizer(colbert_config)
78
  self.amp_manager = MixedPrecisionManager(True)