Geraldine
/

msmarco-distilbert-base-v4-ead

Feature Extraction

text-embeddings-inference

Model card · Files and versions · Community

Geraldine committed on Nov 8, 2024

Commit

ea87379

·

verified ·

1 Parent(s): 4c7052a

Update README.md

Files changed (1) hide show

README.md +3 -20

README.md CHANGED Viewed

@@ -57,7 +57,6 @@ reduced_embeddings = pca.transform(embeddings.detach().numpy())
 from transformers import AutoModel, AutoTokenizer, pipeline
 import joblib
 from huggingface_hub import hf_hub_download
-from sklearn.decomposition import PCA
 # Load the embeddings model
 model = AutoModel.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")
@@ -65,29 +64,13 @@ tokenizer = AutoTokenizer.from_pretrained("Geraldine/msmarco-distilbert-base-v4-
 # Load the PCA model
 pca_path = hf_hub_download("Geraldine/msmarco-distilbert-base-v4-ead", "pca_model.joblib")
-pca = joblib.load(pca_path)
 feature_extraction_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
 class HuggingFaceEmbeddingFunction:
-    def __init__(self, pipeline, pca_model_path=None):
         self.pipeline = pipeline
-        self.pca = PCA(n_components=128) if pca_model_path is None else joblib.load(pca_model_path)
-        self.is_pca_fitted = pca_model_path is not None
-    def fit_pca(self, texts, save_path=None):
-        # Get embeddings as numpy arrays
-        embeddings = self.pipeline(texts)
-        embeddings = [embedding[0][0] for embedding in embeddings]
-        embeddings = np.array(embeddings)
-        # Fit PCA
-        self.pca.fit(embeddings)
-        self.is_pca_fitted = True
-        # Save PCA model if path is provided
-        if save_path:
-            joblib.dump(self.pca, save_path)
     # Function for embedding documents (lists of text)
     def embed_documents(self, texts):
@@ -138,7 +121,7 @@ If you use this model, please cite it as follows:
 ```bibtex
 @misc{geraldine2024eadxml,
-  author = {Your Name or Organization},
   title = {Geraldine/msmarco-distilbert-base-v4-ead: A DistilBERT Embedding Model for EAD/XML Text},
   year = {2024},
   howpublished = {\url{https://huggingface.co/Geraldine/msmarco-distilbert-base-v4-ead}},

 from transformers import AutoModel, AutoTokenizer, pipeline
 import joblib
 from huggingface_hub import hf_hub_download
 # Load the embeddings model
 model = AutoModel.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")
 # Load the PCA model
 pca_path = hf_hub_download("Geraldine/msmarco-distilbert-base-v4-ead", "pca_model.joblib")
 feature_extraction_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
 class HuggingFaceEmbeddingFunction:
+    def __init__(self, pipeline, pca_model_path):
         self.pipeline = pipeline
+        self.pca = joblib.load(pca_model_path)
     # Function for embedding documents (lists of text)
     def embed_documents(self, texts):
 ```bibtex
 @misc{geraldine2024eadxml,
+  author = {Géraldine Geoffroy},
   title = {Geraldine/msmarco-distilbert-base-v4-ead: A DistilBERT Embedding Model for EAD/XML Text},
   year = {2024},
   howpublished = {\url{https://huggingface.co/Geraldine/msmarco-distilbert-base-v4-ead}},