Upload 17 files

- app.py +34 -7
- requirements.txt +1 -1
- utils/models.py +60 -13
- utils/nltkmodules.py +3 -2
- utils/retriever.py +120 -47
- utils/vector_index.py +13 -1
app.py
CHANGED

@@ -1,5 +1,5 @@
 import re
-
+import numpy as np
 import openai
 import streamlit_scrollable_textbox as stx
 
@@ -8,23 +8,27 @@ import streamlit as st
 
 st.set_page_config(layout="wide")  # isort: split
 
+from utils import nltkmodules
 from utils.entity_extraction import (
     clean_entities,
+    extract_keywords,
     extract_quarter_year,
     extract_ticker_spacy,
     format_entities_flan_alpaca,
     generate_alpaca_ner_prompt,
-    extract_keywords
 )
 from utils.models import (
     generate_entities_flan_alpaca_checkpoint,
     generate_entities_flan_alpaca_inference_api,
     generate_text_flan_t5,
-    get_data,
     get_alpaca_model,
+    get_data,
     get_flan_alpaca_xl_model,
     get_flan_t5_model,
     get_instructor_embedding_model,
+    get_instructor_embedding_model_api,
+    get_bm25_model,
+    preprocess_text,
     get_mpnet_embedding_model,
     get_sgpt_embedding_model,
     get_spacy_model,
@@ -55,6 +59,7 @@ from utils.retriever import (
     sentence_id_combine,
     text_lookup,
     year_quarter_range,
+    get_bm25_search_hits,
 )
 from utils.transcript_retrieval import retrieve_transcript
 from utils.vector_index import (
@@ -62,7 +67,6 @@ from utils.vector_index import (
     create_sparse_embeddings,
     hybrid_score_norm,
 )
-from utils import nltkmodules
 
 st.title("Question Answering on Earnings Call Transcripts")
 
@@ -75,6 +79,8 @@ col1, col2 = st.columns([3, 3], gap="medium")
 
 
 with st.sidebar:
+    use_bm25 = st.checkbox("Use BM25 for filtering results")
+
     ner_choice = st.selectbox("Select NER Model", ["Spacy", "Alpaca"])
     document_type = st.selectbox(
         "Select Query Type", ["Single-Document", "Multi-Document"]
@@ -85,6 +91,18 @@ with st.sidebar:
         ["Single-Company", "Compare Companies"],
     )
 
+
+corpus, bm25 = get_bm25_model(data)
+
+tokenized_query = preprocess_text(query_text).split()
+sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]
+indices_hits = get_bm25_search_hits(corpus, sparse_scores, 50)
+
+if use_bm25 == True:
+    indices = indices_hits
+else:
+    indices = None
+
 if ner_choice == "Spacy":
     ner_model = get_spacy_model()
 
@@ -305,7 +323,7 @@ elif encoder_model == "Instructor":
     )
     pinecone_index_name = "week13-instructor-xl"
    pinecone_index = pinecone.Index(pinecone_index_name)
-    retriever_model = get_instructor_embedding_model()
+    retriever_model = get_instructor_embedding_model_api()
     instruction = (
         "Represent the financial question for retrieving supporting documents:"
     )
@@ -318,7 +336,7 @@ elif encoder_model == "Hybrid Instructor - SPLADE":
     )
     pinecone_index_name = "week13-splade-instructor-xl"
     pinecone_index = pinecone.Index(pinecone_index_name)
-    retriever_model = get_instructor_embedding_model()
+    retriever_model = get_instructor_embedding_model_api()
     (
         sparse_retriever_model,
         sparse_retriever_tokenizer,
@@ -382,6 +400,7 @@ if document_type == "Single-Document":
     dense_query_embedding, sparse_query_embedding = hybrid_score_norm(
         dense_query_embedding, sparse_query_embedding, 0.3
     )
+
     query_results = query_pinecone_sparse(
         dense_query_embedding,
         sparse_query_embedding,
@@ -392,6 +411,7 @@ if document_type == "Single-Document":
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
 
@@ -413,6 +433,7 @@ if document_type == "Single-Document":
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
 
@@ -459,6 +480,7 @@ else:
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -490,6 +512,7 @@ else:
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -535,6 +558,7 @@ else:
         ticker_first,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -557,6 +581,7 @@ else:
         ticker_second,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -591,6 +616,7 @@ else:
         ticker_first,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -612,6 +638,7 @@ else:
         ticker_second,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -778,7 +805,7 @@ if decoder_model == "GPT-J":
     )
     submitted = st.form_submit_button("Submit")
 
-tab1, tab2 = st.tabs(["
+tab1, tab2 = st.tabs(["Retrieved Text", "Retrieved Documents"])
 
 
 with tab1:
requirements.txt
CHANGED

@@ -14,4 +14,4 @@ streamlit-scrollable-textbox
 openai
 InstructorEmbedding
 gradio_client
-
+rank_bm25
utils/models.py
CHANGED

@@ -20,26 +20,59 @@ from transformers import (
     T5Tokenizer,
     pipeline,
 )
-
-import
+from rank_bm25 import BM25Okapi, BM25L, BM25Plus
+import numpy as np
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+import re
 import streamlit as st
 
 
-@st.
+@st.cache_resource
 def get_data():
     data = pd.read_csv("earnings_calls_cleaned_metadata.csv")
     return data
 
 
+# Preprocessing for BM25
+
+
+def tokenizer(
+    string, reg="[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"
+):
+    regex = reg
+    string = string.replace("-", " ")
+    return " ".join(re.findall(regex, string))
+
+
+def preprocess_text(text):
+    # Convert to lowercase
+    text = text.lower()
+    # Tokenize the text
+    tokens = word_tokenize(text)
+    # Remove stop words
+    stop_words = set(stopwords.words("english"))
+    tokens = [token for token in tokens if token not in stop_words]
+    # Stem the tokens
+    porter_stemmer = PorterStemmer()
+    tokens = [porter_stemmer.stem(token) for token in tokens]
+    # Join the tokens back into a single string
+    preprocessed_text = " ".join(tokens)
+    preprocessed_text = tokenizer(preprocessed_text)
+
+    return preprocessed_text
+
+
 # Initialize Spacy Model
 
 
-@st.
+@st.cache_resource
 def get_spacy_model():
     return spacy.load("en_core_web_trf")
 
 
-@st.
+@st.cache_resource
 def get_flan_alpaca_xl_model():
     model = AutoModelForSeq2SeqLM.from_pretrained(
         "/home/user/app/models/flan-alpaca-xl/"
@@ -53,19 +86,19 @@ def get_flan_alpaca_xl_model():
 # Initialize models from HuggingFace
 
 
-@st.
+@st.cache_resource
 def get_t5_model():
     return pipeline("summarization", model="t5-small", tokenizer="t5-small")
 
 
-@st.
+@st.cache_resource
 def get_flan_t5_model():
     tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
     model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
     return model, tokenizer
 
 
-@st.
+@st.cache_resource
 def get_mpnet_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -75,7 +108,7 @@ def get_mpnet_embedding_model():
     return model
 
 
-@st.
+@st.cache_resource
 def get_splade_sparse_embedding_model():
     model_sparse = "naver/splade-cocondenser-ensembledistil"
     # check device
@@ -87,7 +120,7 @@ def get_splade_sparse_embedding_model():
     return model_sparse, tokenizer
 
 
-@st.
+@st.cache_resource
 def get_sgpt_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -97,20 +130,34 @@ def get_sgpt_embedding_model():
     return model
 
 
-@st.
+@st.cache_resource
 def get_instructor_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = INSTRUCTOR("hkunlp/instructor-xl")
     return model
 
+@st.cache_resource
+def get_instructor_embedding_model_api():
+    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
+    return client
+
 
-@st.
+@st.cache_resource
 def get_alpaca_model():
     client = Client("https://awinml-alpaca-cpp.hf.space")
     return client
 
 
-@st.
+@st.cache_resource
+def get_bm25_model(data):
+    corpus = data.Text.tolist()
+    corpus_clean = [preprocess_text(x) for x in corpus]
+    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
+    bm25 = BM25Plus(tokenized_corpus)
+    return corpus, bm25
+
+
+@st.cache_resource
 def save_key(api_key):
     return api_key
 
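
Two things change in utils/models.py: every loader now carries @st.cache_resource so models and clients are created once per process instead of on every Streamlit rerun, and a BM25 preprocessing pipeline (lowercase, word_tokenize, stopword removal, Porter stemming, then a regex token filter) defines what get_bm25_model indexes. A hedged sketch of the pipeline's effect, with the regex step omitted for brevity and assuming the NLTK data from utils/nltkmodules.py is available:

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)


def preprocess_text_simplified(text):
    # Lowercase, tokenize, drop stopwords, stem (regex filter omitted).
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(t) for t in tokens if t not in stop_words)


print(preprocess_text_simplified("Revenue grew 10% compared to last quarter"))
# e.g. "revenu grew 10 % compar last quarter"

Note that get_bm25_model returns both the raw corpus (data.Text.tolist()) and the BM25Plus object: get_bm25_search_hits later needs the corpus only to walk the ranked rows, while the integer indices are what actually get returned.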
utils/nltkmodules.py
CHANGED

@@ -1,4 +1,5 @@
 import nltk
 
-nltk.download(
-nltk.download(
+nltk.download("wordnet")
+nltk.download("punkt")
+nltk.download("stopwords")
utils/retriever.py
CHANGED

@@ -1,6 +1,16 @@
-def query_pinecone_sparse(
+def get_bm25_search_hits(corpus, sparse_scores, top_n=50):
+    bm25_search = []
+    indices = []
+    for idx in sparse_scores:
+        if len(bm25_search) <= top_n:
+            bm25_search.append(corpus[idx])
+            indices.append(idx)
+    indices = [int(x) for x in indices]
+    return indices
+
+
+def query_pinecone(
     dense_vec,
-    sparse_vec,
     top_k,
     index,
     year,
@@ -8,6 +18,7 @@ def query_pinecone_sparse(
     ticker,
     participant_type,
     keywords=None,
+    indices=None,
     threshold=0.25,
 ):
     if participant_type == "Company Speaker":
@@ -16,68 +27,126 @@ def query_pinecone_sparse(
         participant = "Question"
 
     # Create filter dictionary based on keywords
-    filter_dict = [{
+    filter_dict = [{"Keywords": word} for word in keywords]
 
     if year == "All":
         if quarter == "All":
+            if indices != None:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                        "index": {"$in": indices},
+                    },
+                    include_metadata=True,
+                )
+            else:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                    },
+                    include_metadata=True,
+                )
+        else:
+            if indices != None:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$eq": quarter},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                        "index": {"$in": indices},
+                    },
+                    include_metadata=True,
+                )
+            else:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$eq": quarter},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                    },
+                    include_metadata=True,
+                )
+    else:
+        # search pinecone index for context passage with the answer
+        if indices != None:
             xc = index.query(
                 vector=dense_vec,
-                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
-                    "Year": {
-                        "$in": [
-                            int("2020"),
-                            int("2019"),
-                            int("2018"),
-                            int("2017"),
-                            int("2016"),
-                        ]
-                    },
-                    "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                    "Year": int(year),
+                    "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
+                    "index": {"$in": indices},
                 },
                 include_metadata=True,
             )
         else:
             xc = index.query(
                 vector=dense_vec,
-                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
-                    "Year": {
-                        "$in": [
-                            int("2020"),
-                            int("2019"),
-                            int("2018"),
-                            int("2017"),
-                            int("2016"),
-                        ]
-                    },
+                    "Year": int(year),
                     "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
-    else:
-        # search pinecone index for context passage with the answer
-        xc = index.query(
-            vector=dense_vec,
-            sparse_vector=sparse_vec,
-            top_k=top_k,
-            filter={
-                "Year": int(year),
-                "Quarter": {"$eq": quarter},
-                "Ticker": {"$eq": ticker},
-                "QA_Flag": {"$eq": participant},
-                "Keywords": {"$in": keywords}
-            },
-            include_metadata=True,
-        )
     # filter the context passages based on the score threshold
     filtered_matches = []
     for match in xc["matches"]:
@@ -87,8 +156,9 @@ def query_pinecone_sparse(
     return xc
 
 
-def query_pinecone(
+def query_pinecone_sparse(
     dense_vec,
+    sparse_vec,
     top_k,
     index,
     year,
@@ -96,6 +166,7 @@ def query_pinecone(
     ticker,
     participant_type,
     keywords=None,
+    indices=None,
     threshold=0.25,
 ):
     if participant_type == "Company Speaker":
@@ -104,13 +175,13 @@ def query_pinecone(
         participant = "Question"
 
     # Create filter dictionary based on keywords
-    filter_dict = [{
-
+    filter_dict = [{"Keywords": word} for word in keywords]
 
     if year == "All":
         if quarter == "All":
             xc = index.query(
                 vector=dense_vec,
+                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -125,13 +196,14 @@ def query_pinecone(
                     "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
         else:
             xc = index.query(
                 vector=dense_vec,
+                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -146,7 +218,7 @@ def query_pinecone(
                     "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
@@ -154,13 +226,14 @@ def query_pinecone(
         # search pinecone index for context passage with the answer
         xc = index.query(
             vector=dense_vec,
+            sparse_vector=sparse_vec,
            top_k=top_k,
             filter={
                 "Year": int(year),
                 "Quarter": {"$eq": quarter},
                 "Ticker": {"$eq": ticker},
                 "QA_Flag": {"$eq": participant},
-                "Keywords": {"$in": keywords}
+                "Keywords": {"$in": keywords},
             },
             include_metadata=True,
         )
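
The retriever changes reduce to one new condition: when indices is supplied, every Pinecone filter gains "index": {"$in": indices}, restricting matches to the BM25-shortlisted rows; the year/quarter logic is otherwise unchanged, which is why each branch is now duplicated for the indices / no-indices cases. An illustrative refactor (not code from the commit) that shows the filter shape in one place:

def build_filter(year, quarter, ticker, participant, keywords, indices=None):
    # Pinecone metadata filter shared by the query_* functions above.
    filter_dict = {
        "Year": {"$in": [2020, 2019, 2018, 2017, 2016]}
        if year == "All"
        else int(year),
        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]}
        if year == "All" and quarter == "All"
        else {"$eq": quarter},
        "Ticker": {"$eq": ticker},
        "QA_Flag": {"$eq": participant},
        "Keywords": {"$in": keywords},
    }
    if indices is not None:
        # BM25 pre-filter: keep only the lexically top-ranked rows.
        filter_dict["index"] = {"$in": indices}
    return filter_dict


print(build_filter("All", "All", "AAPL", "Question", ["revenue"], [1, 5, 9]))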
utils/vector_index.py
CHANGED

@@ -1,11 +1,23 @@
 import torch
+import json
+import numpy as np
 
 
 def create_dense_embeddings(query, model, instruction=None):
     if instruction == None:
         dense_emb = model.encode([query]).tolist()
     else:
-        dense_emb = model.encode([[instruction, query]]).tolist()
+        # Fetching embedding from API for Instructor
+        json_output_embedding = model.predict(
+            instruction,
+            query,
+            api_name="/predict",
+        )
+
+        json_file = open(json_output_embedding, "r")
+        json_dict = json.load(json_file)
+        dense_array = np.array(json_dict["data"], dtype=np.float64)
+        dense_emb = dense_array.tolist()
     return dense_emb
 
 
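
create_dense_embeddings now fetches Instructor embeddings from a hosted Space via gradio_client instead of encoding locally: the Space returns the path of a JSON file whose "data" field holds the embedding, which is loaded and converted back to a list. A minimal sketch of the same call pattern; the Space URL comes from utils/models.py and the instruction string from app.py, while the /predict output schema is inferred from the diff:

import json

import numpy as np
from gradio_client import Client

client = Client("https://awinml-api-instructor-xl-2.hf.space/")

instruction = (
    "Represent the financial question for retrieving supporting documents:"
)
query = "What drove revenue growth this quarter?"

# The Space writes its output to a local JSON file and returns the path.
json_path = client.predict(instruction, query, api_name="/predict")

with open(json_path, "r") as f:
    payload = json.load(f)

embedding = np.array(payload["data"], dtype=np.float64)
print(embedding.shape)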