# 3GPPindexers/bm25_maker.py
import shutil
import zipfile
import json
import bm25s
import nltk
from nltk.stem import WordNetLemmatizer
# WordNet data is required by the lemmatizer; the download is a no-op if it is
# already cached locally.
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()  # e.g. lemmatize("procedures") -> "procedure"
indexer_id = "3gpp_bm25_docs"
unique_specs = set()
with open("indexed_specifications.json", "r") as f:
    spec_data = json.load(f)["specs"]
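
# Assumed shape of indexed_specifications.json, inferred from the fields read
# below (illustrative; the real file may carry additional keys):
# {"specs": {"<key>": {"id": ..., "title": ..., "version": ..., "release": ...,
#                      "type": ..., "working_group": ..., "url": ..., "scope": ...}}}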
# Each JSON file in the archive maps spec ids to document content: either a
# dict of {section_title: section_text} or a bare string (skipped later).
doc_data = {}
with zipfile.ZipFile("indexed_docs_content.zip") as zf:
    for file_name in zf.namelist():
        if file_name.endswith(".json"):
            doc_bytes = zf.read(file_name)
            try:
                doc_data.update(json.loads(doc_bytes.decode("utf-8")))
                print(f"Documents from {file_name} loaded successfully!")
            except json.JSONDecodeError as e:
                print(f"Error while decoding the JSON file {file_name}: {e}")
corpus_json = []
for _, specification in spec_data.items():
    # Skip specs that have already been indexed.
    if specification['id'] in unique_specs:
        continue
    document = doc_data.get(specification['id'], None)
    if document is None:
        continue
    if not isinstance(document, str):
        full_text = f"{specification['id']} - {specification['title']}\n\n\n"
        full_text += "\n".join(f"{title}\n\n{document[title]}" for title in document.keys())
        # WordNetLemmatizer.lemmatize() operates on a single word, so lemmatize
        # token by token instead of passing the whole document as one string.
        lemmatized_text = " ".join(lemmatizer.lemmatize(word) for word in full_text.split())
        corpus_json.append({"text": lemmatized_text, "metadata": {
            "id": specification['id'],
            "title": specification['title'],
            "version": specification['version'],
            "release": specification['release'],
            "type": specification['type'],
            "working_group": specification['working_group'],
            "url": specification['url'],
            "scope": specification['scope']
        }})
    else:
        print(f"Skipping {specification['id']}")
    unique_specs.add(specification['id'])
# Tokenize the lemmatized corpus and build the BM25 index; passing the full
# records as `corpus` lets retrieval return text and metadata together.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = bm25s.BM25(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save(indexer_id)
# Package the saved index directory into bm25s.zip for distribution.
shutil.make_archive("bm25s", 'zip', '.', indexer_id)
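
# Optional sanity check: reload the saved index and run one query. This is a
# minimal sketch, not part of the original pipeline, and the query string is
# illustrative. The query is lemmatized the same way as the corpus so that its
# tokens match the indexed text.
reloaded = bm25s.BM25.load(indexer_id, load_corpus=True)
query = "security architecture for the 5G system"
query = " ".join(lemmatizer.lemmatize(word) for word in query.split())
query_tokens = bm25s.tokenize(query, stopwords="en")
results, scores = reloaded.retrieve(query_tokens, k=3)
for i in range(results.shape[1]):
    doc, score = results[0, i], scores[0, i]
    print(f"{doc['metadata']['id']} ({score:.2f}): {doc['metadata']['title']}")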