Geraldine committed on
Commit ea87379 · verified · 1 Parent(s): 4c7052a

Update README.md

Files changed (1)
  1. README.md +3 -20
README.md CHANGED
@@ -57,7 +57,6 @@ reduced_embeddings = pca.transform(embeddings.detach().numpy())
 from transformers import AutoModel, AutoTokenizer, pipeline
 import joblib
 from huggingface_hub import hf_hub_download
-from sklearn.decomposition import PCA
 
 # Load the embeddings model
 model = AutoModel.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")
@@ -65,29 +64,13 @@ tokenizer = AutoTokenizer.from_pretrained("Geraldine/msmarco-distilbert-base-v4-
 
 # Load the PCA model
 pca_path = hf_hub_download("Geraldine/msmarco-distilbert-base-v4-ead", "pca_model.joblib")
-pca = joblib.load(pca_path)
 
 feature_extraction_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
 
 class HuggingFaceEmbeddingFunction:
-    def __init__(self, pipeline, pca_model_path=None):
+    def __init__(self, pipeline, pca_model_path):
         self.pipeline = pipeline
-        self.pca = PCA(n_components=128) if pca_model_path is None else joblib.load(pca_model_path)
-        self.is_pca_fitted = pca_model_path is not None
-
-    def fit_pca(self, texts, save_path=None):
-        # Get embeddings as numpy arrays
-        embeddings = self.pipeline(texts)
-        embeddings = [embedding[0][0] for embedding in embeddings]
-        embeddings = np.array(embeddings)
-
-        # Fit PCA
-        self.pca.fit(embeddings)
-        self.is_pca_fitted = True
-
-        # Save PCA model if path is provided
-        if save_path:
-            joblib.dump(self.pca, save_path)
+        self.pca = joblib.load(pca_model_path)
 
     # Function for embedding documents (lists of text)
     def embed_documents(self, texts):
@@ -138,7 +121,7 @@ If you use this model, please cite it as follows:
 
 ```bibtex
 @misc{geraldine2024eadxml,
-  author = {Your Name or Organization},
+  author = {Géraldine Geoffroy},
   title = {Geraldine/msmarco-distilbert-base-v4-ead: A DistilBERT Embedding Model for EAD/XML Text},
   year = {2024},
   howpublished = {\url{https://huggingface.co/Geraldine/msmarco-distilbert-base-v4-ead}},
 
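For reference, a minimal sketch of how the updated README snippet fits together after this commit: the pre-fitted PCA model shipped in the repo is downloaded with `hf_hub_download` and applied to the pipeline output, so no local fitting is needed. The body of `embed_documents` (the [CLS]-token pooling and the final `pca.transform` call) and the sample text are assumptions, since the diff truncates that method.

```python
import joblib
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer, pipeline

# Load the embeddings model and tokenizer from the Hub
model = AutoModel.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")
tokenizer = AutoTokenizer.from_pretrained("Geraldine/msmarco-distilbert-base-v4-ead")

# Download the pre-fitted PCA model stored in the same repo
# (scikit-learn must be installed for joblib to unpickle it)
pca_path = hf_hub_download("Geraldine/msmarco-distilbert-base-v4-ead", "pca_model.joblib")

feature_extraction_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

class HuggingFaceEmbeddingFunction:
    def __init__(self, pipeline, pca_model_path):
        self.pipeline = pipeline
        self.pca = joblib.load(pca_model_path)  # already fitted, no fit_pca step

    def embed_documents(self, texts):
        # The pipeline returns token-level features per text; keep the first
        # ([CLS]) token vector of each -- this pooling choice is an assumption.
        outputs = self.pipeline(texts)
        embeddings = np.array([np.array(output[0][0]) for output in outputs])
        # Reduce to the PCA dimensionality before returning
        return self.pca.transform(embeddings).tolist()

# Hypothetical usage
embedder = HuggingFaceEmbeddingFunction(feature_extraction_pipeline, pca_model_path=pca_path)
vectors = embedder.embed_documents(["Finding aid for an archival collection encoded in EAD/XML"])
```

Reusing the serialized PCA rather than refitting it locally keeps all embeddings in the same reduced space.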