import json
import shutil
import zipfile

import bm25s
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data needed by the lemmatizer (a no-op if it is already present).
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

indexer_id = "3gpp_bm25_docs"
unique_specs = set()

with open("indexed_specifications.json", "r") as f:
    spec_data = json.load(f)["specs"]

# Load the per-specification document contents shipped as JSON files inside the zip.
doc_data = {}
with zipfile.ZipFile("indexed_docs_content.zip") as zf:
    for file_name in zf.namelist():
        if file_name.endswith(".json"):
            doc_bytes = zf.read(file_name)
            try:
                doc_data.update(json.loads(doc_bytes.decode("utf-8")))
                print(f"Documents loaded successfully from {file_name}!")
            except json.JSONDecodeError as e:
                print(f"Error while decoding the JSON file {file_name}: {e}")

corpus_json = []

for _, specification in spec_data.items():
    if specification["id"] in unique_specs:
        continue
    document = doc_data.get(specification["id"])
    if document is None:
        continue
    if not isinstance(document, str):
        # Concatenate the spec header with every section title and its content.
        full_text = f"{specification['id']} - {specification['title']}\n\n\n"
        full_text += "\n".join(f"{title}\n\n{document[title]}" for title in document)
        # WordNetLemmatizer works on single words, so lemmatize token by token
        # rather than passing the whole document (which would leave it unchanged).
        lemmatized_text = " ".join(lemmatizer.lemmatize(token) for token in full_text.split())
        corpus_json.append({
            "text": lemmatized_text,
            "metadata": {
                "id": specification["id"],
                "title": specification["title"],
                "version": specification["version"],
                "release": specification["release"],
                "type": specification["type"],
                "working_group": specification["working_group"],
                "url": specification["url"],
                "scope": specification["scope"],
            },
        })
    else:
        print(f"Skipping {specification['id']}")
    unique_specs.add(specification["id"])

corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

# Build the BM25 index, keeping the corpus entries so retrieval can return full metadata.
retriever = bm25s.BM25(corpus=corpus_json)
retriever.index(corpus_tokens)

retriever.save(indexer_id)

# Package the saved index directory as bm25s.zip.
shutil.make_archive("bm25s", "zip", ".", indexer_id)
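
# Optional sanity check: a minimal sketch of querying the saved index.
# It assumes bm25s' BM25.load / retrieve API; the query string is purely illustrative,
# and queries should be lemmatized the same way as the corpus to match the indexed text.
#
#   reloaded = bm25s.BM25.load(indexer_id, load_corpus=True)
#   query = " ".join(lemmatizer.lemmatize(token) for token in "example 3GPP query".split())
#   query_tokens = bm25s.tokenize(query, stopwords="en")
#   results, scores = reloaded.retrieve(query_tokens, k=3)
#   print(results[0, 0]["metadata"]["id"], scores[0, 0])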