import json
import shutil
import zipfile

import bm25s
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

indexer_id = "3gpp_bm25_docs"
unique_specs = set()

# Load the specification metadata.
with open("indexed_specifications.json", "r") as f:
    spec_data = json.load(f)["specs"]

# Load the document contents shipped as JSON files inside the zip archive,
# merging them into a single mapping keyed by specification id.
doc_data = {}
with zipfile.ZipFile("indexed_docs_content.zip", "r") as zf:
    for file_name in zf.namelist():
        if not file_name.endswith(".json"):
            continue
        doc_bytes = zf.read(file_name)
        try:
            doc_data.update(json.loads(doc_bytes.decode("utf-8")))
            print(f"Loaded {file_name} successfully!")
        except json.JSONDecodeError as e:
            print(f"Error while decoding the JSON file {file_name}: {e}")

# Build the corpus: one entry per specification, with its sections flattened
# into a single text field and the specification attributes kept as metadata.
corpus_json = []
for _, specification in spec_data.items():
    if specification["id"] in unique_specs:
        continue

    document = doc_data.get(specification["id"])
    if document is None:
        continue

    if isinstance(document, str):
        # Documents stored as a single string carry no per-section structure; skip them.
        print(f"Skipping {specification['id']}")
        unique_specs.add(specification["id"])
        continue

    full_text = f"{specification['id']} - {specification['title']}\n\n\n"
    full_text += "\n".join(
        f"{title}\n\n{document[title]}" for title in document.keys()
    )

    # WordNetLemmatizer.lemmatize() operates on single tokens, so lemmatize
    # word by word while preserving the line structure of the document.
    lemmatized_text = "\n".join(
        " ".join(lemmatizer.lemmatize(word) for word in line.split())
        for line in full_text.split("\n")
    )

    corpus_json.append({
        "text": lemmatized_text,
        "metadata": {
            "id": specification["id"],
            "title": specification["title"],
            "version": specification["version"],
            "release": specification["release"],
            "type": specification["type"],
            "working_group": specification["working_group"],
            "url": specification["url"],
            "scope": specification["scope"],
        },
    })
    unique_specs.add(specification["id"])

# Tokenize the corpus (English stopwords removed), build the BM25 index,
# save it to disk, and package the index directory as a zip archive.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

retriever = bm25s.BM25(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save(indexer_id)

shutil.make_archive("bm25s", "zip", ".", indexer_id)
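
# --- Optional sanity check -------------------------------------------------
# A minimal sketch of how the saved index could be reloaded and queried with
# bm25s. The query string below is only an illustrative example (not part of
# the dataset), and it is lemmatized the same way as the corpus so that the
# query tokens line up with the indexed text.
reloaded = bm25s.BM25.load(indexer_id, load_corpus=True)
query = "procedures for handover between base stations"  # hypothetical sample query
query = " ".join(lemmatizer.lemmatize(word) for word in query.split())
query_tokens = bm25s.tokenize(query, stopwords="en")
results, scores = reloaded.retrieve(query_tokens, k=3)
for rank in range(results.shape[1]):
    doc = results[0, rank]
    print(f"{rank + 1}. {doc['metadata']['id']} (score={scores[0, rank]:.2f})")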