Update README.md
Browse files
README.md
CHANGED
@@ -57,7 +57,6 @@ reduced_embeddings = pca.transform(embeddings.detach().numpy())
|
|
57 |
from transformers import AutoModel, AutoTokenizer, pipeline
|
58 |
import joblib
|
59 |
from huggingface_hub import hf_hub_download
|
60 |
-
from sklearn.decomposition import PCA
|
61 |
|
62 |
# Load the embeddings model
|
63 |
model = AutoModel.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")
|
@@ -65,29 +64,13 @@ tokenizer = AutoTokenizer.from_pretrained("Geraldine/msmarco-distilbert-base-v4-
|
|
65 |
|
66 |
# Load the PCA model
|
67 |
pca_path = hf_hub_download("Geraldine/msmarco-distilbert-base-v4-ead", "pca_model.joblib")
|
68 |
-
pca = joblib.load(pca_path)
|
69 |
|
70 |
feature_extraction_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
|
71 |
|
72 |
class HuggingFaceEmbeddingFunction:
|
73 |
-
def __init__(self, pipeline, pca_model_path=None):
|
74 |
self.pipeline = pipeline
|
75 |
-
self.pca =
|
76 |
-
self.is_pca_fitted = pca_model_path is not None
|
77 |
-
|
78 |
-
def fit_pca(self, texts, save_path=None):
|
79 |
-
# Get embeddings as numpy arrays
|
80 |
-
embeddings = self.pipeline(texts)
|
81 |
-
embeddings = [embedding[0][0] for embedding in embeddings]
|
82 |
-
embeddings = np.array(embeddings)
|
83 |
-
|
84 |
-
# Fit PCA
|
85 |
-
self.pca.fit(embeddings)
|
86 |
-
self.is_pca_fitted = True
|
87 |
-
|
88 |
-
# Save PCA model if path is provided
|
89 |
-
if save_path:
|
90 |
-
joblib.dump(self.pca, save_path)
|
91 |
|
92 |
# Function for embedding documents (lists of text)
|
93 |
def embed_documents(self, texts):
|
@@ -138,7 +121,7 @@ If you use this model, please cite it as follows:
|
|
138 |
|
139 |
```bibtex
|
140 |
@misc{geraldine2024eadxml,
|
141 |
-
author = {
|
142 |
title = {Geraldine/msmarco-distilbert-base-v4-ead: A DistilBERT Embedding Model for EAD/XML Text},
|
143 |
year = {2024},
|
144 |
howpublished = {\url{https://huggingface.co/Geraldine/msmarco-distilbert-base-v4-ead}},
|
|
|
57 |
from transformers import AutoModel, AutoTokenizer, pipeline
|
58 |
import joblib
|
59 |
from huggingface_hub import hf_hub_download
|
|
|
60 |
|
61 |
# Load the embeddings model
|
62 |
model = AutoModel.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")
|
|
|
64 |
|
65 |
# Load the PCA model
|
66 |
pca_path = hf_hub_download("Geraldine/msmarco-distilbert-base-v4-ead", "pca_model.joblib")
|
|
|
67 |
|
68 |
feature_extraction_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
|
69 |
|
70 |
class HuggingFaceEmbeddingFunction:
|
71 |
+
def __init__(self, pipeline, pca_model_path):
|
72 |
self.pipeline = pipeline
|
73 |
+
self.pca = joblib.load(pca_model_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
# Function for embedding documents (lists of text)
|
76 |
def embed_documents(self, texts):
|
|
|
121 |
|
122 |
```bibtex
|
123 |
@misc{geraldine2024eadxml,
|
124 |
+
author = {Géraldine Geoffroy},
|
125 |
title = {Geraldine/msmarco-distilbert-base-v4-ead: A DistilBERT Embedding Model for EAD/XML Text},
|
126 |
year = {2024},
|
127 |
howpublished = {\url{https://huggingface.co/Geraldine/msmarco-distilbert-base-v4-ead}},
|