Spaces:
Running
Running
use lightweight multilingual model
Browse files- README.md +1 -2
- pinecone_handler.py +5 -3
- settings.py +2 -1
- timestamp2.txt +1 -1
README.md
CHANGED
@@ -61,5 +61,4 @@ Querying from the Pinecone vector database is simple and fast thanks to the Pine
|
|
61 |
|
62 |
1. The [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) truncates input text longer than 256 word pieces. To capture all the semantics from job listings, we probably need a sentence transformer which can embed longer input texts.
|
63 |
2. The [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) is not optimized for multilingual text. Many people in Sweden have their resumes in Swedish, so better performance would probably be achieved with a multilingual model.
|
64 |
-
3.
|
65 |
-
4. Users should be able to filter on municipality or location, because the current app ignores where the person wants to work (often not explicitly mentioned in their resume), making many job listings not relevant anyway.
|
|
|
61 |
|
62 |
1. The [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) truncates input text longer than 256 word pieces. To capture all the semantics from job listings, we probably need a sentence transformer which can embed longer input texts.
|
63 |
2. The [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) is not optimized for multilingual text. Many people in Sweden have their resumes in Swedish, so better performance would probably be achieved with a multilingual model.
|
64 |
+
3. Users should be able to filter on municipality or location, because the current app ignores where the person wants to work (often not explicitly mentioned in their resume), making many job listings not relevant anyway.
|
|
pinecone_handler.py
CHANGED
@@ -40,20 +40,22 @@ class PineconeHandler:
|
|
40 |
log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
|
41 |
spec = ServerlessSpec(
|
42 |
cloud="aws",
|
43 |
-
region="us-
|
44 |
)
|
45 |
|
46 |
self.pc.create_index(
|
47 |
name=PINECONE_INDEX_NAME,
|
48 |
-
dimension=
|
49 |
metric="cosine",
|
50 |
spec=spec
|
51 |
)
|
52 |
self.index = self.pc.Index(PINECONE_INDEX_NAME)
|
53 |
|
54 |
#self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
|
55 |
#512 token max length, embedding dim 768
|
56 |
-
self.model = SentenceTransformer('sentence-transformers/allenai-specter')
|
57 |
log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
|
58 |
|
59 |
def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
|
|
|
40 |
log.info(f"Creating new index '{PINECONE_INDEX_NAME}'")
|
41 |
spec = ServerlessSpec(
|
42 |
cloud="aws",
|
43 |
+
region="us-east-1"
|
44 |
)
|
45 |
|
46 |
self.pc.create_index(
|
47 |
name=PINECONE_INDEX_NAME,
|
48 |
+
dimension=384,
|
49 |
metric="cosine",
|
50 |
spec=spec
|
51 |
)
|
52 |
self.index = self.pc.Index(PINECONE_INDEX_NAME)
|
53 |
|
54 |
#self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
55 |
+
#self.model = SentenceTransformer('intfloat/multilingual-e5-large')
|
56 |
+
self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
|
57 |
#allenai-specter (below): 512 token max length, embedding dim 768
|
58 |
+
#self.model = SentenceTransformer('sentence-transformers/allenai-specter')
|
59 |
log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
|
60 |
|
61 |
def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
|
settings.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import logging
|
2 |
|
3 |
PINECONE_ENVIRONMENT = "gcp-starter"
|
4 |
-
PINECONE_INDEX_NAME = "jobads-index"
|
|
|
5 |
|
6 |
DB_TABLE_NAME = 'jobads'
|
7 |
DB_FILE_NAME = 'jobads_database_20220127.db'
|
|
|
1 |
import logging
|
2 |
|
3 |
PINECONE_ENVIRONMENT = "gcp-starter"
|
4 |
+
#PINECONE_INDEX_NAME = "jobads-index"
|
5 |
+
PINECONE_INDEX_NAME = "jobsai-multilingual-small"
|
6 |
|
7 |
DB_TABLE_NAME = 'jobads'
|
8 |
DB_FILE_NAME = 'jobads_database_20220127.db'
|
timestamp2.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
2025-01-
|
|
|
1 |
+
2025-01-05T22:38:10
|