diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..8e41cd9b5edb0445264fd480d5109de0787ac969 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +**/__pycache__/ +*.DS_Store + diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100755 index 0000000000000000000000000000000000000000..f65b0110de1c152497f18b09738c79666c844e6b --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,21 @@ + +[client] +toolbarMode = "viewer" +showSidebarNavigation = false +showErrorDetails = true + +[browser] +gatherUsageStats = false + +[theme] +base="dark" +font="sans serif" +primaryColor="#e28743" +backgroundColor ="#000000" + +[global] +disableWidgetStateDuplicationWarning = true +showWarningOnDirectExecution = false + +[server] +enableXsrfProtection=false \ No newline at end of file diff --git a/RAG/bedrock_agent.py b/RAG/bedrock_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..52e78ee47626a0a0abaa0428bd5adaf505e003b4 --- /dev/null +++ b/RAG/bedrock_agent.py @@ -0,0 +1,146 @@ +import boto3 +import json +import time +import zipfile +from io import BytesIO +import uuid +import pprint +import logging +print(boto3.__version__) +from PIL import Image +import os +import base64 +import re +import requests +import utilities.re_ranker as re_ranker +import utilities.invoke_models as invoke_models +import streamlit as st +import time as t +import botocore.exceptions + +if "inputs_" not in st.session_state: + st.session_state.inputs_ = {} + +parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) +region = 'us-east-1' +print(region) +account_id = '445083327804' +# setting logger +logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - 
%(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) +# getting boto3 clients for required AWS services + +#bedrock_agent_client = boto3.client('bedrock-agent',region_name=region) +bedrock_agent_runtime_client = boto3.client( + 'bedrock-agent-runtime', + aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1' +) +enable_trace:bool = True +end_session:bool = False + +def delete_memory(): + response = bedrock_agent_runtime_client.delete_agent_memory( + agentAliasId='TSTALIASID', + agentId='B4Z7BTURC4' + ) + +def query_(inputs): + ## create a random id for session initiator id + + + # invoke the agent API + agentResponse = bedrock_agent_runtime_client.invoke_agent( + inputText=inputs['shopping_query'], + agentId='B4Z7BTURC4', + agentAliasId='TSTALIASID', + sessionId=st.session_state.session_id_, + enableTrace=enable_trace, + endSession= end_session + ) + + logger.info(pprint.pprint(agentResponse)) + print("***agent*****response*********") + print(agentResponse) + event_stream = agentResponse['completion'] + total_context = [] + last_tool = "" + last_tool_name = "" + agent_answer = "" + try: + for event in event_stream: + print("***event*********") + print(event) + # if 'chunk' in event: + # data = event['chunk']['bytes'] + # print("***chunk*********") + # print(data) + # logger.info(f"Final answer ->\n{data.decode('utf8')}") + # agent_answer_ = data.decode('utf8') + # print(agent_answer_) + if 'trace' in event: + print("trace*****total*********") + print(event['trace']) + if('orchestrationTrace' not in event['trace']['trace']): + continue + orchestration_trace = event['trace']['trace']['orchestrationTrace'] + total_context_item = {} + if('modelInvocationOutput' in orchestration_trace and '' in orchestration_trace['modelInvocationOutput']['rawResponse']['content']): + total_context_item['tool'] = orchestration_trace['modelInvocationOutput']['rawResponse'] + if('rationale' in orchestration_trace): + total_context_item['rationale'] = orchestration_trace['rationale']['text'] + if('invocationInput' in orchestration_trace): + total_context_item['invocationInput'] = orchestration_trace['invocationInput']['actionGroupInvocationInput'] + last_tool_name = total_context_item['invocationInput']['function'] + if('observation' in orchestration_trace): + print("trace****observation******") + total_context_item['observation'] = event['trace']['trace']['orchestrationTrace']['observation'] + tool_output_last_obs = event['trace']['trace']['orchestrationTrace']['observation'] + print(tool_output_last_obs) + if(tool_output_last_obs['type'] == 'ACTION_GROUP'): + last_tool = tool_output_last_obs['actionGroupInvocationOutput']['text'] + if(tool_output_last_obs['type'] == 'FINISH'): + agent_answer = tool_output_last_obs['finalResponse']['text'] + if('modelInvocationOutput' in orchestration_trace and '' in orchestration_trace['modelInvocationOutput']['rawResponse']['content']): + total_context_item['thinking'] = orchestration_trace['modelInvocationOutput']['rawResponse'] + if(total_context_item!={}): + total_context.append(total_context_item) + print("total_context------") + print(total_context) + except botocore.exceptions.EventStreamError as error: + raise error + # t.sleep(2) + # query_(st.session_state.inputs_) + + # if 'chunk' in event: + # data = event['chunk']['bytes'] + # final_ans = data.decode('utf8') + # print(f"Final answer ->\n{final_ans}") + # logger.info(f"Final answer ->\n{final_ans}") + # agent_answer = 
final_ans + # end_event_received = True + # # End event indicates that the request finished successfully + # elif 'trace' in event: + # logger.info(json.dumps(event['trace'], indent=2)) + # else: + # raise Exception("unexpected event.", event) + # except Exception as e: + # raise Exception("unexpected event.", e) + return {'text':agent_answer,'source':total_context,'last_tool':{'name':last_tool_name,'response':last_tool}} + + ####### Re-Rank ######## + + #print("re-rank") + + # if(st.session_state.input_is_rerank == True and len(total_context)): + # ques = [{"question":question}] + # ans = [{"answer":total_context}] + + # total_context = re_ranker.re_rank('rag','Cross Encoder',"",ques, ans) + + # llm_prompt = prompt_template.format(context=total_context[0],question=question) + # output = invoke_models.invoke_llm_model( "\n\nHuman: {input}\n\nAssistant:".format(input=llm_prompt) ,False) + # #print(output) + # if(len(images_2)==0): + # images_2 = images + # return {'text':output,'source':total_context,'image':images_2,'table':df} diff --git a/RAG/generate_csv_for_tables.py b/RAG/generate_csv_for_tables.py new file mode 100644 index 0000000000000000000000000000000000000000..da79ad995eec70fd5d3877c4d7c0e9f2ad94a33f --- /dev/null +++ b/RAG/generate_csv_for_tables.py @@ -0,0 +1,167 @@ +import os +import json +import boto3 +import io +from io import BytesIO +import sys +from pprint import pprint +from PyPDF2 import PdfWriter, PdfReader +import re +import shutil +import streamlit as st + +file_content = {} +parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) +# if os.path.isdir(parent_dirname+"/split_pdf"): +# shutil.rmtree(parent_dirname+"/split_pdf") +# os.mkdir(parent_dirname+"/split_pdf") + +# if os.path.isdir(parent_dirname+"/split_pdf_csv"): +# shutil.rmtree(parent_dirname+"/split_pdf_csv") +# os.mkdir(parent_dirname+"/split_pdf_csv") + + +def get_rows_columns_map(table_result, blocks_map): + rows = {} + #scores = [] + for relationship in table_result['Relationships']: + if relationship['Type'] == 'CHILD': + for child_id in relationship['Ids']: + cell = blocks_map[child_id] + if cell['BlockType'] == 'CELL': + row_index = cell['RowIndex'] + col_index = cell['ColumnIndex'] + if row_index not in rows: + # create new row + rows[row_index] = {} + + # get confidence score + #scores.append(str(cell['Confidence'])) + + # get the text value + rows[row_index][col_index] = get_text(cell, blocks_map) + return rows#, scores + + +def get_text(result, blocks_map): + text = '' + if 'Relationships' in result: + for relationship in result['Relationships']: + if relationship['Type'] == 'CHILD': + for child_id in relationship['Ids']: + word = blocks_map[child_id] + if word['BlockType'] == 'WORD': + if "," in word['Text'] and word['Text'].replace(",", "").isnumeric(): + text += '"' + word['Text'] + '"' +' ' + else: + text += word['Text'] +' ' + if word['BlockType'] == 'SELECTION_ELEMENT': + if word['SelectionStatus'] =='SELECTED': + text += 'X ' + return text + + +def split_pages(file_name): + + inputpdf = PdfReader(open(file_name, "rb")) + file_name_short = re.sub('[^A-Za-z0-9]+', '', (file_name.split("/")[-1].split(".")[0]).lower()) + + for i in range(len(inputpdf.pages)): + + output = PdfWriter() + output.add_page(inputpdf.pages[i]) + split_file = parent_dirname+"/split_pdf/"+file_name_short+"%s.pdf" % i + + with open(split_file, "wb") as outputStream: + output.write(outputStream) + table_csv = get_table_csv_results(split_file) + if(table_csv != " NO Table FOUND "): + + output_file = 
parent_dirname+"/split_pdf_csv/"+file_name_short+"%s.csv" % i + file_content[output_file] = table_csv + + # replace content + with open(output_file, "wt") as fout: + fout.write(table_csv) + + # show the results + print('CSV OUTPUT FILE: ', output_file) + return file_content + +def get_table_csv_results(file_name): + + with open(file_name, 'rb') as file: + img_test = file.read() + bytes_test = bytearray(img_test) + #print('Image loaded', file_name) + + # process using image bytes + # get the results + #session = boto3.Session(profile_name='profile-name') + client = boto3.client('textract',aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') + # {'S3Object': { + # 'Bucket': 'ml-search-app-access', + # 'Name': 'covid19_ie_removed.pdf' + # }} + + response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES']) + + # Get the text blocks + blocks=response['Blocks'] + #pprint(blocks) + + blocks_map = {} + table_blocks = [] + for block in blocks: + blocks_map[block['Id']] = block + if block['BlockType'] == "TABLE": + table_blocks.append(block) + + if len(table_blocks) <= 0: + return " NO Table FOUND " + + csv = '' + for index, table in enumerate(table_blocks): + csv += generate_table_csv(table, blocks_map, index +1) + csv += '\n\n' + + + return csv + +def generate_table_csv(table_result, blocks_map, table_index): + rows = get_rows_columns_map(table_result, blocks_map) + + table_id = 'Table_' + str(table_index) + + # get cells. + csv = ''#Table: {0}\n\n'.format(table_id) + for row_index, cols in rows.items(): + for col_index, text in cols.items(): + col_indices = len(cols.items()) + csv += text.strip()+"`" #'{}'.format(text) + "," + csv += '\n' + + # csv += '\n\n Confidence Scores % (Table Cell) \n' + # cols_count = 0 + # for score in scores: + # cols_count += 1 + # csv += score + "," + # if cols_count == col_indices: + # csv += '\n' + # cols_count = 0 + + csv += '\n\n\n' + return csv + +def main_(file_name): + table_csv = split_pages(file_name) + #print(table_csv) + return table_csv + + + + +# if __name__ == "__main__": +# file_name = "/home/ubuntu/covid19_ie_removed.pdf" +# main(file_name) diff --git a/RAG/rag_DocumentLoader.py b/RAG/rag_DocumentLoader.py new file mode 100644 index 0000000000000000000000000000000000000000..d3f42c75c84f6dd67bafe5f884a7274767f10ee8 --- /dev/null +++ b/RAG/rag_DocumentLoader.py @@ -0,0 +1,395 @@ +import boto3 +import json +import os +import shutil +import time +from unstructured.partition.pdf import partition_pdf +from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth +import streamlit as st +from PIL import Image +import base64 +import re +#import torch +import base64 +import requests +from requests_aws4auth import AWS4Auth +import re_ranker +import utilities.invoke_models as invoke_models +from requests.auth import HTTPBasicAuth + +import generate_csv_for_tables +from pdf2image import convert_from_bytes,convert_from_path +#import langchain + +bedrock_runtime_client = boto3.client('bedrock-runtime',region_name='us-east-1') +textract_client = boto3.client('textract',region_name='us-east-1') + +region = 'us-east-1' +service = 'es' + +credentials = boto3.Session().get_credentials() +auth = HTTPBasicAuth('prasadnu',st.secrets['rag_shopping_assistant_os_api_access']) + +ospy_client = OpenSearch( + hosts = [{'host': 'search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com', 'port': 443}], + http_auth = auth, + 
use_ssl = True, + verify_certs = True, + connection_class = RequestsHttpConnection, + pool_maxsize = 20 +) + + + +summary_prompt = """You are an assistant tasked with summarizing tables and text. \ +Give a detailed summary of the table or text. Table or text chunk: {element} """ + + +parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) + + + + +def generate_image_captions_(image_paths): + images = [] + for image_path in image_paths: + i_image = Image.open(image_path) + if i_image.mode != "RGB": + i_image = i_image.convert(mode="RGB") + + images.append(i_image) + + pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values + pixel_values = pixel_values.to(device) + + output_ids = model.generate(pixel_values, **gen_kwargs) + + preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + preds = [pred.strip() for pred in preds] + return preds + + + + +def load_docs(inp): + + print("input_doc") + print(inp) + extracted_elements_list = [] + + + data_dir = parent_dirname+"/pdfs" + target_files = [os.path.join(data_dir,inp["key"])] + + + + Image.MAX_IMAGE_PIXELS = 100000000 + width = 2048 + height = 2048 + + + for target_file in target_files: + tables_textract = generate_csv_for_tables.main_(target_file) + #tables_textract = {} + index_ = re.sub('[^A-Za-z0-9]+', '', (target_file.split("/")[-1].split(".")[0]).lower()) + st.session_state.input_index = index_ + + if os.path.isdir(parent_dirname+'/figures/') == False: + os.mkdir(parent_dirname+'/figures/') + + + + + + image_output_dir = parent_dirname+'/figures/'+st.session_state.input_index+"/" + + if os.path.isdir(image_output_dir): + shutil.rmtree(image_output_dir) + + + os.mkdir(image_output_dir) + + + print("***") + print(target_file) + #image_output_dir_path = os.path.join(image_output_dir,target_file.split('/')[-1].split('.')[0]) + #os.mkdir(image_output_dir_path) + + # with open(target_file, "rb") as pdf_file: + # encoded_string_pdf = bytearray(pdf_file.read()) + + #images_pdf = convert_from_path(target_file) + + # for index,image in enumerate(images_pdf): + # image.save(image_output_dir_pdf+"/"+st.session_state.input_index+"/"+str(index)+"_pdf.jpeg", 'JPEG') + # with open(image_output_dir_pdf+"/"+st.session_state.input_index+"/"+str(index)+"_pdf.jpeg", "rb") as read_img: + # input_encoded = base64.b64encode(read_img.read()) + # print(encoded_string_pdf) + # tables_= textract_client.analyze_document( + # Document={'Bytes': encoded_string_pdf}, + # FeatureTypes=['TABLES'] + # ) + + # print(tables_) + + table_and_text_elements = partition_pdf( + filename=target_file, + extract_images_in_pdf=True, + infer_table_structure=False, + chunking_strategy="by_title", #Uses title elements to identify sections within the document for chunking + max_characters=4000, + new_after_n_chars=3800, + combine_text_under_n_chars=2000, + extract_image_block_output_dir=parent_dirname+'/figures/'+st.session_state.input_index+'/', + ) + tables = [] + texts = [] + print(table_and_text_elements) + + + for table in tables_textract.keys(): + print(table) + #print(tables_textract[table]) + tables.append({'table_name':table,'raw':tables_textract[table],'summary':invoke_models.invoke_llm_model(summary_prompt.format(element=tables_textract[table]),False)}) + time.sleep(4) + + + for element in table_and_text_elements: + # if "unstructured.documents.elements.Table" in str(type(element)): + # tables.append({'raw':str(element),'summary':invoke_models.invoke_llm_model(summary_prompt.format(element=str(element)),False)}) + # 
tables_source.append({'raw':element,'summary':invoke_models.invoke_llm_model(summary_prompt.format(element=str(element)),False)}) + + if "unstructured.documents.elements.CompositeElement" in str(type(element)): + texts.append(str(element)) + image_captions = {} + + + for image_file in os.listdir(image_output_dir): + print("image_processing") + + photo_full_path = image_output_dir+image_file + photo_full_path_no_format = photo_full_path.replace('.jpg',"") + + with Image.open(photo_full_path) as image: + image.verify() + + with Image.open(photo_full_path) as image: + + file_type = 'jpg' + path = image.filename.rsplit(".", 1)[0] + image.thumbnail((width, height)) + image.save(photo_full_path_no_format+"-resized.jpg") + + with open(photo_full_path_no_format+"-resized.jpg", "rb") as read_img: + input_encoded = base64.b64encode(read_img.read()).decode("utf8") + + + image_captions[image_file] = {"caption":invoke_models.generate_image_captions_llm(input_encoded, "What's in this image?"), + "encoding":input_encoded + } + print("image_processing done") + #print(image_captions) + + #print(os.path.join('figures',image_file)) + extracted_elements_list = [] + extracted_elements_list.append({ + 'source': target_file, + 'tables': tables, + 'texts': texts, + 'images': image_captions + }) + documents = [] + documents_mm = [] + for extracted_element in extracted_elements_list: + print("prepping data") + texts = extracted_element['texts'] + tables = extracted_element['tables'] + images_data = extracted_element['images'] + src_doc = extracted_element['source'] + for text in texts: + embedding = invoke_models.invoke_model(text) + document = prep_document(text,text,'text',src_doc,'none',embedding) + documents.append(document) + for table in tables: + table_raw = table['raw'] + + + table_summary = table['summary'] + embedding = invoke_models.invoke_model(table_summary) + + document = prep_document(table_raw,table_summary,'table*'+table['table_name'],src_doc,'none',embedding) + documents.append(document) + for file_name in images_data.keys(): + embedding = invoke_models.invoke_model_mm(image_captions[file_name]['caption'],image_captions[file_name]['encoding']) + document = prep_document(image_captions[file_name]['caption'],image_captions[file_name]['caption'],'image_'+file_name,src_doc,image_captions[file_name]['encoding'],embedding) + documents_mm.append(document) + + embedding = invoke_models.invoke_model(image_captions[file_name]['caption']) + document = prep_document(image_captions[file_name]['caption'],image_captions[file_name]['caption'],'image_'+file_name,src_doc,'none',embedding) + documents.append(document) + + + + os_ingest(index_, documents) + os_ingest_mm(index_, documents_mm) + +def prep_document(raw_element,processed_element,doc_type,src_doc,encoding,embedding): + if('image' in doc_type): + img_ = doc_type.split("_")[1] + else: + img_ = "None" + document = { + "processed_element": re.sub(r"[^a-zA-Z0-9]+", ' ', processed_element) , + "raw_element_type": doc_type.split("*")[0], + "raw_element": re.sub(r"[^a-zA-Z0-9]+", ' ', raw_element) , + "src_doc": src_doc.replace(","," "), + "image": img_, + + } + + if(encoding!="none"): + document["image_encoding"] = encoding + document["processed_element_embedding_bedrock-multimodal"] = embedding + else: + document["processed_element_embedding"] = embedding + + if('table' in doc_type): + document["table"] = doc_type.split("*")[1] + + return document + + + +def os_ingest(index_,documents): + print("ingesting data") + #host = 'your collection 
id.region.aoss.amazonaws.com' + if(ospy_client.indices.exists(index=index_)): + ospy_client.indices.delete(index = index_) + index_body = { + "settings": { + "index": { + "knn": True, + "default_pipeline": "rag-ingest-pipeline", + "number_of_shards": 4 + } + }, + "mappings": { + "properties": { + "processed_element": { + "type": "text" + }, + "raw_element": { + "type": "text" + }, + "processed_element_embedding": { + "type": "knn_vector", + "dimension":1536, + "method": { + "engine": "faiss", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + # "processed_element_embedding_bedrock-multimodal": { + # "type": "knn_vector", + # "dimension": 1024, + # "method": { + # "engine": "faiss", + # "space_type": "l2", + # "name": "hnsw", + # "parameters": {} + # } + # }, + # "image_encoding": { + # "type": "binary" + # }, + "raw_element_type": { + "type": "text" + }, + "processed_element_embedding_sparse": { + "type": "rank_features" + }, + "src_doc": { + "type": "text" + }, + "image":{ "type": "text"} + + } + } + } + response = ospy_client.indices.create(index_, body=index_body) + + for doc in documents: + print("----------doc------------") + if(doc['image']!='None'): + print("image insert") + print(doc['image']) + + response = ospy_client.index( + index = index_, + body = doc, + ) + + +def os_ingest_mm(index_,documents_mm): + #host = 'your collection id.region.aoss.amazonaws.com' + index_ = index_+"_mm" + if(ospy_client.indices.exists(index=index_)): + ospy_client.indices.delete(index = index_) + index_body = { + "settings": { + "index": { + "knn": True, + # "default_pipeline": "rag-ingest-pipeline", + "number_of_shards": 4 + } + }, + "mappings": { + "properties": { + "processed_element": { + "type": "text" + }, + "raw_element": { + "type": "text" + }, + + "processed_element_embedding_bedrock-multimodal": { + "type": "knn_vector", + "dimension": 1024, + "method": { + "engine": "faiss", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + "image_encoding": { + "type": "binary" + }, + "raw_element_type": { + "type": "text" + }, + + "src_doc": { + "type": "text" + }, + "image":{ "type": "text"} + + } + } + } + response = ospy_client.indices.create(index_, body=index_body) + + for doc in documents_mm: + #print("----------doc------------") + #print(doc) + + response = ospy_client.index( + index = index_, + body = doc, + ) + + + + diff --git a/RAG/rag_DocumentSearcher.py b/RAG/rag_DocumentSearcher.py new file mode 100644 index 0000000000000000000000000000000000000000..5bb041f04b9a224c003b2e1fe95c5e7bf3d91c3a --- /dev/null +++ b/RAG/rag_DocumentSearcher.py @@ -0,0 +1,338 @@ +import boto3 +import json +import os +import shutil +from unstructured.partition.pdf import partition_pdf +from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth +import streamlit as st +from PIL import Image +import base64 +import re +#from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer +import torch +import base64 +import requests +import utilities.re_ranker as re_ranker +import utilities.invoke_models as invoke_models +#import langchain +headers = {"Content-Type": "application/json"} +host = "https://search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com/" + +parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) + +def query_(awsauth,inputs, session_id,search_types): + + print("using index: "+st.session_state.input_index) + + question = inputs['query'] + + k=1 + embedding = 
invoke_models.invoke_model_mm(question,"none") + + query_mm = { + "size": k, + "_source": { + "exclude": [ + "processed_element_embedding_bedrock-multimodal","processed_element_embedding_sparse","image_encoding","processed_element_embedding" + ] + }, + "query": { + "knn": { + "processed_element_embedding_bedrock-multimodal": { + "vector": embedding, + "k": k} + } + } + } + + path = st.session_state.input_index+"_mm/_search" + url = host+path + r = requests.get(url, auth=awsauth, json=query_mm, headers=headers) + response_mm = json.loads(r.text) + # response_mm = ospy_client.search( + # body = query_mm, + # index = st.session_state.input_index+"_mm" + # ) + + + + hits = response_mm['hits']['hits'] + context = [] + context_tables = [] + images = [] + + for hit in hits: + #context.append(hit['_source']['caption']) + images.append({'file':hit['_source']['image'],'caption':hit['_source']['processed_element']}) + + ####### SEARCH ######## + + + path = "_search/pipeline/rag-search-pipeline" + url = host + path + + num_queries = len(search_types) + + weights = [] + + searches = ['Keyword','Vector','NeuralSparse'] + equal_weight = (int(100/num_queries) )/100 + if(num_queries>1): + for index,search in enumerate(search_types): + + if(index != (num_queries-1)): + weight = equal_weight + else: + weight = 1-sum(weights) + + weights.append(weight) + + #print(weights) + + + s_pipeline_payload = { + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": weights + } + } + } + } + ] + } + + r = requests.put(url, auth=awsauth, json=s_pipeline_payload, headers=headers) + #print(r.status_code) + #print(r.text) + + + + SIZE = 5 + + hybrid_payload = { + "_source": { + "exclude": [ + "processed_element_embedding","processed_element_embedding_sparse" + ] + }, + "query": { + "hybrid": { + "queries": [ + + #1. keyword query + #2. vector search query + #3. 
Sparse query + + ] + } + },"size":SIZE, + } + + + + if('Keyword Search' in search_types): + + keyword_payload = { + "match": { + "processed_element": { + "query": question + } + } + } + + hybrid_payload["query"]["hybrid"]["queries"].append(keyword_payload) + + + + if('Vector Search' in search_types): + + embedding = embedding = invoke_models.invoke_model(question) + + vector_payload = { + "knn": { + "processed_element_embedding": { + "vector": embedding, + "k": 2} + } + } + + hybrid_payload["query"]["hybrid"]["queries"].append(vector_payload) + + if('Sparse Search' in search_types): + + #print("text expansion is enabled") + sparse_payload = { "neural_sparse": { + "processed_element_embedding_sparse": { + "query_text": question, + "model_id": "srrJ-owBQhe1aB-khx2n" + } + }} + + + hybrid_payload["query"]["hybrid"]["queries"].append(sparse_payload) + + # path2 = "_plugins/_ml/models/srrJ-owBQhe1aB-khx2n/_predict" + # url2 = host+path2 + # payload2 = { + # "parameters": { + # "inputs": question + # } + # } + # r2 = requests.post(url2, auth=awsauth, json=payload2, headers=headers) + # sparse_ = json.loads(r2.text) + # query_sparse = sparse_["inference_results"][0]["output"][0]["dataAsMap"]["response"][0] + + + + + + # print("hybrid_payload") + # print("---------------") + #print(hybrid_payload) + hits = [] + if(num_queries>1): + path = st.session_state.input_index+"/_search?search_pipeline=rag-search-pipeline" + else: + path = st.session_state.input_index+"/_search" + url = host+path + if(len(hybrid_payload["query"]["hybrid"]["queries"])==1): + single_query = hybrid_payload["query"]["hybrid"]["queries"][0] + del hybrid_payload["query"]["hybrid"] + hybrid_payload["query"] = single_query + r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers) + #print(r.status_code) + response_ = json.loads(r.text) + #print("-------------------------------------------------------------------") + #print(r.text) + hits = response_['hits']['hits'] + + else: + r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers) + #print(r.status_code) + response_ = json.loads(r.text) + #print("-------------------------------------------------------------------") + #print(response_) + hits = response_['hits']['hits'] + + ##### GET reference tables separately like *_mm index search for images ###### + def lazy_get_table(): + #print("Forcing table analysis") + table_ref = [] + any_table_exists = False + for fname in os.listdir(parent_dirname+"/split_pdf_csv"): + if fname.startswith(st.session_state.input_index): + any_table_exists = True + break + if(any_table_exists): + #################### Basic Match query ################# + # payload_tables = { + # "query": { + # "bool":{ + + # "must":{"match": { + # "processed_element": question + + # }}, + + # "filter":{"term":{"raw_element_type": "table"}} + + + # }}} + + #################### Neural Sparse query ################# + payload_tables = {"query":{"neural_sparse": { + "processed_element_embedding_sparse": { + "query_text": question, + "model_id": "srrJ-owBQhe1aB-khx2n" + } + } } } + + + r_ = requests.get(url, auth=awsauth, json=payload_tables, headers=headers) + r_tables = json.loads(r_.text) + + for res_ in r_tables['hits']['hits']: + if(res_["_source"]['raw_element_type'] == 'table'): + table_ref.append({'name':res_["_source"]['table'],'text':res_["_source"]['processed_element']}) + if(len(table_ref) == 2): + break + + + return table_ref + + + ########################### LLM Generation ######################## + prompt_template = """ + The 
following is a friendly conversation between a human and an AI. + The AI is talkative and provides lots of specific details from its context. + {context} + Instruction: Based on the above documents, provide a detailed answer for, {question}. Answer "don't know", + if not present in the context. + Solution:""" + + + + idx = 0 + images_2 = [] + is_table_in_result = False + df = [] + for hit in hits[0:3]: + + + if(hit["_source"]["raw_element_type"] == 'table'): + #print("Need to analyse table") + is_table_in_result = True + table_res = invoke_models.read_from_table(hit["_source"]["table"],question) + df.append({'name':hit["_source"]["table"],'text':hit["_source"]["processed_element"]}) + context_tables.append(table_res+"\n\n"+hit["_source"]["processed_element"]) + + else: + if(hit["_source"]["image"]!="None"): + with open(parent_dirname+'/figures/'+st.session_state.input_index+"/"+hit["_source"]["raw_element_type"].split("_")[1].replace(".jpg","")+"-resized.jpg", "rb") as read_img: + input_encoded = base64.b64encode(read_img.read()).decode("utf8") + context.append(invoke_models.generate_image_captions_llm(input_encoded,question)) + else: + context.append(hit["_source"]["processed_element"]) + + if(hit["_source"]["image"]!="None"): + images_2.append({'file':hit["_source"]["image"],'caption':hit["_source"]["processed_element"]}) + + idx = idx +1 + #images.append(hit['_source']['image']) + + # if(is_table_in_result == False): + # df = lazy_get_table() + # print("forcefully selected top 2 tables") + # print(df) + + # for pos,table in enumerate(df): + # table_res = invoke_models.read_from_table(table['name'],question) + # context_tables.append(table_res)#+"\n\n"+table['text'] + + + total_context = context_tables + context + + ####### Re-Rank ######## + + #print("re-rank") + + if(st.session_state.input_is_rerank == True and len(total_context)): + ques = [{"question":question}] + ans = [{"answer":total_context}] + + total_context = re_ranker.re_rank('rag','Cross Encoder',"",ques, ans) + + llm_prompt = prompt_template.format(context=total_context[0],question=question) + output = invoke_models.invoke_llm_model( "\n\nHuman: {input}\n\nAssistant:".format(input=llm_prompt) ,False) + #print(output) + if(len(images_2)==0): + images_2 = images + return {'text':output,'source':total_context,'image':images_2,'table':df} diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e7ae0821601e7b94679a0557144fcde59fe1b668 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +--- +title: OpenSearch AI +emoji: 🔍 +colorFrom: pink +colorTo: purple +sdk: streamlit +sdk_version: 1.41.1 +app_file: app.py +pinned: false +license: apache-2.0 +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..24acb1e33f92122958239caa6fe77d96d1a2ebd0 --- /dev/null +++ b/app.py @@ -0,0 +1,125 @@ +import streamlit as st +from PIL import Image +import base64 +import yaml +import os +import urllib.request +import tarfile +import subprocess +from yaml.loader import SafeLoader + + +st.set_page_config( + + #page_title="Semantic Search using OpenSearch", + layout="wide", + page_icon="/home/ubuntu/images/opensearch_mark_default.png" +) + +st.markdown(""" + """,unsafe_allow_html=True) + +# with open('/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp/auth.yaml') as file: +# config = yaml.load(file, Loader=SafeLoader) +# authenticator = 
Authenticate( +# config['credentials'], +# config['cookie']['name'], +# config['cookie']['key'], +# config['cookie']['expiry_days'], +# config['preauthorized'] +# ) +# name, authentication_status, username = authenticator.login('Login', 'main') + + +AI_ICON = "images/opensearch-twitter-card.png" +col_0_1,col_0_2,col_0_3= st.columns([10,50,85]) +with col_0_1: + st.image(AI_ICON, use_container_width='always') +with col_0_2: + st.markdown('

OpenSearch AI demos
',unsafe_allow_html=True) + #st.header("OpenSearch AI demos")#,divider = 'rainbow' +# with col_0_3: +# st.markdown("Workshop",unsafe_allow_html=True) + + +#st.header(":rewind: Demos available") +st.write("") +#st.write("----") +#st.write("Choose a demo") +st.write("") +col_1_1,col_1_2,col_1_3 = st.columns([3,40,65]) +with col_1_1: + st.subheader(" ") +with col_1_2: + st.markdown('

Neural Search
',unsafe_allow_html=True) +with col_1_3: + demo_1 = st.button(":arrow_forward:",key = "demo_1") +if(demo_1): + st.switch_page('pages/Semantic_Search.py') +st.write("") +#st.page_link("pages/1_Semantic_Search.py", label=":orange[1. Semantic Search] :arrow_forward:") +#st.button("1. Semantic Search") +# image_ = Image.open('/home/ubuntu/images/Semantic_SEarch.png') +# new_image = image_.resize((1500, 1000)) +# new_image.save('images/semantic_search_resize.png') +# st.image("images/semantic_search_resize.png") +col_2_1,col_2_2,col_2_3 = st.columns([3,40,65]) +with col_2_1: + st.subheader(" ") +with col_2_2: + st.markdown('

Multimodal Conversational Search
',unsafe_allow_html=True) + +with col_2_3: + demo_2 = st.button(":arrow_forward:",key = "demo_2") +if(demo_2): + st.switch_page('pages/Multimodal_Conversational_Search.py') +st.write("") +#st.header("2. Multimodal Conversational Search") +# image_ = Image.open('images/RAG_.png') +# new_image = image_.resize((1500, 1000)) +# new_image.save('images/RAG_resize.png') +# st.image("images/RAG_resize.png") + +col_3_1,col_3_2,col_3_3 = st.columns([3,40,65]) +with col_3_1: + st.subheader(" ") +with col_3_2: + st.markdown('
Agentic Shopping Assistant
',unsafe_allow_html=True)#New +with col_3_3: + demo_3 = st.button(":arrow_forward:",key = "demo_3") +if(demo_3): + st.switch_page('pages/AI_Shopping_Assistant.py') +# with st.sidebar: +# st.subheader("Choose a demo !") + + + + + # """ + # + # """, + +isExist = os.path.exists("/home/user/images_retail") +if not isExist: + os.makedirs("/home/user/images_retail") + metadata_file = urllib.request.urlretrieve('https://aws-blogs-artifacts-public.s3.amazonaws.com/BDB-3144/products-data.yml', '/home/user/products.yaml') + img_filename,headers= urllib.request.urlretrieve('https://aws-blogs-artifacts-public.s3.amazonaws.com/BDB-3144/images.tar.gz', '/home/user/images_retail/images.tar.gz') + print(img_filename) + file = tarfile.open('/home/user/images_retail/images.tar.gz') + file.extractall('/home/user/images_retail/') + file.close() + #remove images.tar.gz + os.remove('/home/user/images_retail/images.tar.gz') \ No newline at end of file diff --git a/figures/ukhousingstats/figure-1-1-resized.jpg b/figures/ukhousingstats/figure-1-1-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..999c7f78b0bad5c64ace806bd3c8797591d45953 Binary files /dev/null and b/figures/ukhousingstats/figure-1-1-resized.jpg differ diff --git a/figures/ukhousingstats/figure-1-1.jpg b/figures/ukhousingstats/figure-1-1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6cdc6f5e2fe543f3413a6b69b147dd1e797ec533 Binary files /dev/null and b/figures/ukhousingstats/figure-1-1.jpg differ diff --git a/figures/ukhousingstats/figure-1-2-resized.jpg b/figures/ukhousingstats/figure-1-2-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f1dbd41d79a2c4aadf80145644d17eec097ff84e Binary files /dev/null and b/figures/ukhousingstats/figure-1-2-resized.jpg differ diff --git a/figures/ukhousingstats/figure-1-2.jpg b/figures/ukhousingstats/figure-1-2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e5b8765b2ad1a3f35494ea3c07fa62a24d180466 Binary files /dev/null and b/figures/ukhousingstats/figure-1-2.jpg differ diff --git a/figures/ukhousingstats/figure-2-3-resized.jpg b/figures/ukhousingstats/figure-2-3-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ca50e5436622b95638ed9518707089ee73c79661 Binary files /dev/null and b/figures/ukhousingstats/figure-2-3-resized.jpg differ diff --git a/figures/ukhousingstats/figure-2-3.jpg b/figures/ukhousingstats/figure-2-3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e255dc675f718d667681ca6b95069869bead5b1c Binary files /dev/null and b/figures/ukhousingstats/figure-2-3.jpg differ diff --git a/figures/ukhousingstats/figure-3-4-resized.jpg b/figures/ukhousingstats/figure-3-4-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..033e5e7c39444238d0ad20f2742d44990e0a6d81 Binary files /dev/null and b/figures/ukhousingstats/figure-3-4-resized.jpg differ diff --git a/figures/ukhousingstats/figure-3-4.jpg b/figures/ukhousingstats/figure-3-4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b1c62c2f8e16c2fea3cc42a5df37cc23b0086394 Binary files /dev/null and b/figures/ukhousingstats/figure-3-4.jpg differ diff --git a/figures/ukhousingstats/figure-3-5-resized.jpg b/figures/ukhousingstats/figure-3-5-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bb84fc2a15e64bec06d8a01ec027982f180da1db Binary files /dev/null and b/figures/ukhousingstats/figure-3-5-resized.jpg differ diff --git 
a/figures/ukhousingstats/figure-3-5.jpg b/figures/ukhousingstats/figure-3-5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8064373554d553a115ce0071ebdf5da40d3b544f Binary files /dev/null and b/figures/ukhousingstats/figure-3-5.jpg differ diff --git a/figures/ukhousingstats/figure-4-6-resized.jpg b/figures/ukhousingstats/figure-4-6-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fcefd41c86c517ed3c0537c684f86894c023169e Binary files /dev/null and b/figures/ukhousingstats/figure-4-6-resized.jpg differ diff --git a/figures/ukhousingstats/figure-4-6.jpg b/figures/ukhousingstats/figure-4-6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4d6257cd850b364119ee1a9cc92191afc2929a66 Binary files /dev/null and b/figures/ukhousingstats/figure-4-6.jpg differ diff --git a/figures/ukhousingstats/figure-4-7-resized.jpg b/figures/ukhousingstats/figure-4-7-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..59e2d8b02017d9739ad08d61f0ba299c178e7bc5 Binary files /dev/null and b/figures/ukhousingstats/figure-4-7-resized.jpg differ diff --git a/figures/ukhousingstats/figure-4-7.jpg b/figures/ukhousingstats/figure-4-7.jpg new file mode 100644 index 0000000000000000000000000000000000000000..38dd868fe273fa9ee8a463bb3eaaa4d38fd2b9f8 Binary files /dev/null and b/figures/ukhousingstats/figure-4-7.jpg differ diff --git a/figures/ukhousingstats/figure-5-8-resized.jpg b/figures/ukhousingstats/figure-5-8-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..62c97f46cf866a2ecf075b186a6ad1c44c8ac72d Binary files /dev/null and b/figures/ukhousingstats/figure-5-8-resized.jpg differ diff --git a/figures/ukhousingstats/figure-5-8.jpg b/figures/ukhousingstats/figure-5-8.jpg new file mode 100644 index 0000000000000000000000000000000000000000..86752f784ce71769dd4a91abf99643632dad47aa Binary files /dev/null and b/figures/ukhousingstats/figure-5-8.jpg differ diff --git a/figures/ukhousingstats/figure-6-10-resized.jpg b/figures/ukhousingstats/figure-6-10-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..62160baf885ab3907f45272646747b0f3afedf6d Binary files /dev/null and b/figures/ukhousingstats/figure-6-10-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-10.jpg b/figures/ukhousingstats/figure-6-10.jpg new file mode 100644 index 0000000000000000000000000000000000000000..146a72245e84943480ba59704ac11ae765e8af35 Binary files /dev/null and b/figures/ukhousingstats/figure-6-10.jpg differ diff --git a/figures/ukhousingstats/figure-6-11-resized.jpg b/figures/ukhousingstats/figure-6-11-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6519d04f558c3110a2cb8bc29335b18b022be531 Binary files /dev/null and b/figures/ukhousingstats/figure-6-11-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-11.jpg b/figures/ukhousingstats/figure-6-11.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7260c5a08f0650b7bfe82c89afbb1773bc2a0895 Binary files /dev/null and b/figures/ukhousingstats/figure-6-11.jpg differ diff --git a/figures/ukhousingstats/figure-6-12-resized.jpg b/figures/ukhousingstats/figure-6-12-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a10ea4b964eaf1300ffa89713614e046910a5185 Binary files /dev/null and b/figures/ukhousingstats/figure-6-12-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-12.jpg b/figures/ukhousingstats/figure-6-12.jpg new file mode 
100644 index 0000000000000000000000000000000000000000..c8fcd798ed826eb26b120ecccdc07b957d7487c5 Binary files /dev/null and b/figures/ukhousingstats/figure-6-12.jpg differ diff --git a/figures/ukhousingstats/figure-6-13-resized.jpg b/figures/ukhousingstats/figure-6-13-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ab65f371eff6d1043e0033e9a5e6fe29a36cc26d Binary files /dev/null and b/figures/ukhousingstats/figure-6-13-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-13.jpg b/figures/ukhousingstats/figure-6-13.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0ffd17c51a6eed07bae78e5ae70de63b5a6cf400 Binary files /dev/null and b/figures/ukhousingstats/figure-6-13.jpg differ diff --git a/figures/ukhousingstats/figure-6-14-resized.jpg b/figures/ukhousingstats/figure-6-14-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49ef34c93e28731bb1b99751c93bf9ffbab13c1e Binary files /dev/null and b/figures/ukhousingstats/figure-6-14-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-14.jpg b/figures/ukhousingstats/figure-6-14.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a3c1a68b2eb0e512cad53dad6a74c361b09489d9 Binary files /dev/null and b/figures/ukhousingstats/figure-6-14.jpg differ diff --git a/figures/ukhousingstats/figure-6-15-resized.jpg b/figures/ukhousingstats/figure-6-15-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..44143c0ac0eeaa285d35e2a8da7b3f70ae5c044e Binary files /dev/null and b/figures/ukhousingstats/figure-6-15-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-15.jpg b/figures/ukhousingstats/figure-6-15.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c90ade236bfc328b4c7e512c978f105d8bb7dceb Binary files /dev/null and b/figures/ukhousingstats/figure-6-15.jpg differ diff --git a/figures/ukhousingstats/figure-6-16-resized.jpg b/figures/ukhousingstats/figure-6-16-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..33d05624fb2e3c2b868269090cb64c390e71668f Binary files /dev/null and b/figures/ukhousingstats/figure-6-16-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-16.jpg b/figures/ukhousingstats/figure-6-16.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e79c46d5d90feb17fa52fc2c4fc93c7e55de281c Binary files /dev/null and b/figures/ukhousingstats/figure-6-16.jpg differ diff --git a/figures/ukhousingstats/figure-6-17-resized.jpg b/figures/ukhousingstats/figure-6-17-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..54833f5061afda0b21fe1e12d923c004f865c1dc Binary files /dev/null and b/figures/ukhousingstats/figure-6-17-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-17.jpg b/figures/ukhousingstats/figure-6-17.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c796d7107502eab757aafa991cfc38b02dbc4f2f Binary files /dev/null and b/figures/ukhousingstats/figure-6-17.jpg differ diff --git a/figures/ukhousingstats/figure-6-18-resized.jpg b/figures/ukhousingstats/figure-6-18-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..db29de16d3cf205774e5639c6352b5a402037ea9 Binary files /dev/null and b/figures/ukhousingstats/figure-6-18-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-18.jpg b/figures/ukhousingstats/figure-6-18.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..954cd6e0481ba85479e9fb915f12b17431e80666 Binary files /dev/null and b/figures/ukhousingstats/figure-6-18.jpg differ diff --git a/figures/ukhousingstats/figure-6-19-resized.jpg b/figures/ukhousingstats/figure-6-19-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..62cb929d3d7d87b26a05e6224114b44212e306d3 Binary files /dev/null and b/figures/ukhousingstats/figure-6-19-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-19.jpg b/figures/ukhousingstats/figure-6-19.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2a2a7d1c7968079ac48144dd279cd637b5a74262 Binary files /dev/null and b/figures/ukhousingstats/figure-6-19.jpg differ diff --git a/figures/ukhousingstats/figure-6-20-resized.jpg b/figures/ukhousingstats/figure-6-20-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ad9c48391fca4a6297e198d03e4f807228a022e1 Binary files /dev/null and b/figures/ukhousingstats/figure-6-20-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-20.jpg b/figures/ukhousingstats/figure-6-20.jpg new file mode 100644 index 0000000000000000000000000000000000000000..aa48f537e7a8a3cf1adca6a97d453ba659b44410 Binary files /dev/null and b/figures/ukhousingstats/figure-6-20.jpg differ diff --git a/figures/ukhousingstats/figure-6-21-resized.jpg b/figures/ukhousingstats/figure-6-21-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f57264a9fa12c3ed05bfd58c1b3b566e402b4c5d Binary files /dev/null and b/figures/ukhousingstats/figure-6-21-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-21.jpg b/figures/ukhousingstats/figure-6-21.jpg new file mode 100644 index 0000000000000000000000000000000000000000..34976c7b0f552cfb0044630453f35304ae697756 Binary files /dev/null and b/figures/ukhousingstats/figure-6-21.jpg differ diff --git a/figures/ukhousingstats/figure-6-22-resized.jpg b/figures/ukhousingstats/figure-6-22-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f2e613ebdc05ff5f74a250de1d7c5bf1874d157c Binary files /dev/null and b/figures/ukhousingstats/figure-6-22-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-22.jpg b/figures/ukhousingstats/figure-6-22.jpg new file mode 100644 index 0000000000000000000000000000000000000000..335fdd291960a53dfe4a3169f992683e09abea08 Binary files /dev/null and b/figures/ukhousingstats/figure-6-22.jpg differ diff --git a/figures/ukhousingstats/figure-6-23-resized.jpg b/figures/ukhousingstats/figure-6-23-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5a44d3d52db6a6c1a4f9d4d13429acae9db335fc Binary files /dev/null and b/figures/ukhousingstats/figure-6-23-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-23.jpg b/figures/ukhousingstats/figure-6-23.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f606a425b3814241eaf23181b687ddb7833d3c5b Binary files /dev/null and b/figures/ukhousingstats/figure-6-23.jpg differ diff --git a/figures/ukhousingstats/figure-6-24-resized.jpg b/figures/ukhousingstats/figure-6-24-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fc6a090bd7afd8d8b3a555fb37a69055597ace2a Binary files /dev/null and b/figures/ukhousingstats/figure-6-24-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-24.jpg b/figures/ukhousingstats/figure-6-24.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..969344334e3748771843e06d8ecf5f4d4c209de4 Binary files /dev/null and b/figures/ukhousingstats/figure-6-24.jpg differ diff --git a/figures/ukhousingstats/figure-6-25-resized.jpg b/figures/ukhousingstats/figure-6-25-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..07013181d8900c3a1793db136a1e279d3f6e1993 Binary files /dev/null and b/figures/ukhousingstats/figure-6-25-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-25.jpg b/figures/ukhousingstats/figure-6-25.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fd205e8752b63c642ceed6d7588b1c322335fe76 Binary files /dev/null and b/figures/ukhousingstats/figure-6-25.jpg differ diff --git a/figures/ukhousingstats/figure-6-26-resized.jpg b/figures/ukhousingstats/figure-6-26-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a33e1768418c06f846a20ad2f97f0d93ac1f4110 Binary files /dev/null and b/figures/ukhousingstats/figure-6-26-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-26.jpg b/figures/ukhousingstats/figure-6-26.jpg new file mode 100644 index 0000000000000000000000000000000000000000..316933dc5fc1e6a1316fe8e52eacf0d649cc3fd4 Binary files /dev/null and b/figures/ukhousingstats/figure-6-26.jpg differ diff --git a/figures/ukhousingstats/figure-6-27-resized.jpg b/figures/ukhousingstats/figure-6-27-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..705d7a9a2821cf6e3e47dd017a7824bca8eb4622 Binary files /dev/null and b/figures/ukhousingstats/figure-6-27-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-27.jpg b/figures/ukhousingstats/figure-6-27.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7b901050551b8dbf3a7856e665e6cafe0033cc04 Binary files /dev/null and b/figures/ukhousingstats/figure-6-27.jpg differ diff --git a/figures/ukhousingstats/figure-6-28-resized.jpg b/figures/ukhousingstats/figure-6-28-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b491ba51b87c7450094024471061b822959ed357 Binary files /dev/null and b/figures/ukhousingstats/figure-6-28-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-28.jpg b/figures/ukhousingstats/figure-6-28.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9af90aa03bb0883dce5f12fcee00a08e93f9036f Binary files /dev/null and b/figures/ukhousingstats/figure-6-28.jpg differ diff --git a/figures/ukhousingstats/figure-6-29-resized.jpg b/figures/ukhousingstats/figure-6-29-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..32ad259988bd4391c4e887c5c859213ad783536a Binary files /dev/null and b/figures/ukhousingstats/figure-6-29-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-29.jpg b/figures/ukhousingstats/figure-6-29.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bdc932ea8dcd33cd47f8727d1439837ae8595827 Binary files /dev/null and b/figures/ukhousingstats/figure-6-29.jpg differ diff --git a/figures/ukhousingstats/figure-6-30-resized.jpg b/figures/ukhousingstats/figure-6-30-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7948fb1044b4a212e267984a5752d3c5649bda5e Binary files /dev/null and b/figures/ukhousingstats/figure-6-30-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-30.jpg b/figures/ukhousingstats/figure-6-30.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..6a75c1047a6e05352a3c424e3aaefd0a359546ea Binary files /dev/null and b/figures/ukhousingstats/figure-6-30.jpg differ diff --git a/figures/ukhousingstats/figure-6-31-resized.jpg b/figures/ukhousingstats/figure-6-31-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c4891c347348884c76c4156bfea8dbe13b17df16 Binary files /dev/null and b/figures/ukhousingstats/figure-6-31-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-31.jpg b/figures/ukhousingstats/figure-6-31.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f2cf05b164d7042223e946946143f68c7c7b4ce1 Binary files /dev/null and b/figures/ukhousingstats/figure-6-31.jpg differ diff --git a/figures/ukhousingstats/figure-6-9-resized.jpg b/figures/ukhousingstats/figure-6-9-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..363ec4f7d2e3fb6847d11fff8c37b8a57aa19aed Binary files /dev/null and b/figures/ukhousingstats/figure-6-9-resized.jpg differ diff --git a/figures/ukhousingstats/figure-6-9.jpg b/figures/ukhousingstats/figure-6-9.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1700547b4602c0997414cd41bd2bf3959f22d692 Binary files /dev/null and b/figures/ukhousingstats/figure-6-9.jpg differ diff --git a/images/AI.png b/images/AI.png new file mode 100755 index 0000000000000000000000000000000000000000..bbfdec04aec01c4adeb70342ef129727aebec097 Binary files /dev/null and b/images/AI.png differ diff --git a/images/Github-symbol.png b/images/Github-symbol.png new file mode 100755 index 0000000000000000000000000000000000000000..065bf4aa941b80958b9cbed650c6a2ccf9674494 Binary files /dev/null and b/images/Github-symbol.png differ diff --git a/images/Image_Icon.png b/images/Image_Icon.png new file mode 100755 index 0000000000000000000000000000000000000000..dbeef93cb74bdd3ff367f8d4a8b426f2c91c5afb Binary files /dev/null and b/images/Image_Icon.png differ diff --git a/images/RAG.png b/images/RAG.png new file mode 100755 index 0000000000000000000000000000000000000000..f3de22547973c5709d433b356428f2702689a280 Binary files /dev/null and b/images/RAG.png differ diff --git a/images/RAG_.png b/images/RAG_.png new file mode 100755 index 0000000000000000000000000000000000000000..5a5aa3dd74a5c22ab41c5ce1ddedc865a8eeea36 Binary files /dev/null and b/images/RAG_.png differ diff --git a/images/RAG_resize.png b/images/RAG_resize.png new file mode 100755 index 0000000000000000000000000000000000000000..907b2c00c01d7a31ebeef4f411f23bbe6d199b87 Binary files /dev/null and b/images/RAG_resize.png differ diff --git a/images/Semantic_SEarch.png b/images/Semantic_SEarch.png new file mode 100755 index 0000000000000000000000000000000000000000..c1198f0ea1589daa8fecfc3b499937f54042e632 Binary files /dev/null and b/images/Semantic_SEarch.png differ diff --git a/images/ai-icon.png b/images/ai-icon.png new file mode 100755 index 0000000000000000000000000000000000000000..e9623dd8798dc2c1260880a2314c2e84de03ab71 Binary files /dev/null and b/images/ai-icon.png differ diff --git a/images/opensearch-twitter-card.png b/images/opensearch-twitter-card.png new file mode 100755 index 0000000000000000000000000000000000000000..077f1868dee49c81e269c88206333fd267170d8e Binary files /dev/null and b/images/opensearch-twitter-card.png differ diff --git a/images/opensearch_logo.png b/images/opensearch_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..93f932fa9ad9072ccb50977f3041d3f60096c87e Binary files /dev/null and 
b/images/opensearch_logo.png differ diff --git a/images/opensearch_mark_default.png b/images/opensearch_mark_default.png new file mode 100755 index 0000000000000000000000000000000000000000..77ea00b50d5327b947c97b88728b69ccdb6b6e3e Binary files /dev/null and b/images/opensearch_mark_default.png differ diff --git a/images/quip.png b/images/quip.png new file mode 100755 index 0000000000000000000000000000000000000000..2484c14b2ceabba2d931dd9054a10f39d026baa6 Binary files /dev/null and b/images/quip.png differ diff --git a/images/regenerate.png b/images/regenerate.png new file mode 100755 index 0000000000000000000000000000000000000000..34dc5be6135f602bfef8497ea1c79a26a21ffd11 Binary files /dev/null and b/images/regenerate.png differ diff --git a/images/search_types.png b/images/search_types.png new file mode 100755 index 0000000000000000000000000000000000000000..89de30a83af4c2ceb3c722ee838ea2e2bee9ff0a Binary files /dev/null and b/images/search_types.png differ diff --git a/images/search_types_resize.png b/images/search_types_resize.png new file mode 100755 index 0000000000000000000000000000000000000000..7f411f5e8885ac32d994e8efd562646d72a9cfe1 Binary files /dev/null and b/images/search_types_resize.png differ diff --git a/images/semantic_search.png b/images/semantic_search.png new file mode 100755 index 0000000000000000000000000000000000000000..916c0aabb57ee56ec059917f47e52829e471adc8 Binary files /dev/null and b/images/semantic_search.png differ diff --git a/images/semantic_search_resize.png b/images/semantic_search_resize.png new file mode 100755 index 0000000000000000000000000000000000000000..2a86911b82e3a6f4bc7305a2c9a30b032706ef28 Binary files /dev/null and b/images/semantic_search_resize.png differ diff --git a/images/service_logo.png b/images/service_logo.png new file mode 100755 index 0000000000000000000000000000000000000000..dc7486ea03ba8b1ad36ffd37dec3d9c81fa1c349 Binary files /dev/null and b/images/service_logo.png differ diff --git a/images/text.png b/images/text.png new file mode 100755 index 0000000000000000000000000000000000000000..383f4c2abffe6d66fb26a10734854e871a780f51 Binary files /dev/null and b/images/text.png differ diff --git a/images/text_icon.png b/images/text_icon.png new file mode 100755 index 0000000000000000000000000000000000000000..1574defc0e401ad38d88a91ef89698ea055978ac Binary files /dev/null and b/images/text_icon.png differ diff --git a/images/user-icon.png b/images/user-icon.png new file mode 100755 index 0000000000000000000000000000000000000000..f48176e9f75e5f7a7ac1d0786d8254944bb9b5ae Binary files /dev/null and b/images/user-icon.png differ diff --git a/images/user.png b/images/user.png new file mode 100755 index 0000000000000000000000000000000000000000..930c8968093f37274409173e602440ea24ecf357 Binary files /dev/null and b/images/user.png differ diff --git a/pages/AI_Shopping_Assistant.py b/pages/AI_Shopping_Assistant.py new file mode 100644 index 0000000000000000000000000000000000000000..932b2d20d581247bc5dbfdc9c07c437c8dc176f4 --- /dev/null +++ b/pages/AI_Shopping_Assistant.py @@ -0,0 +1,648 @@ +import streamlit as st +import uuid +import os +import re +import sys +import uuid +from io import BytesIO +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/semantic_search") +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/RAG") +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities") +import boto3 +import requests +from boto3 import Session +import botocore.session +import json 
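# The three sys.path.insert calls above make the sibling packages
# (semantic_search, RAG, utilities) importable when Streamlit runs this page.
# The same setup, sketched with pathlib; APP_ROOT is an illustrative name,
# not a variable used elsewhere in this app.
from pathlib import Path

APP_ROOT = Path(__file__).resolve().parents[1]
for pkg in ("semantic_search", "RAG", "utilities"):
    sys.path.insert(1, str(APP_ROOT / pkg))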
+import random +import string +# import rag_DocumentLoader +# import rag_DocumentSearcher +import pandas as pd +from PIL import Image +import shutil +import base64 +import time +import botocore +#from langchain.callbacks.base import BaseCallbackHandler +#import streamlit_nested_layout +#from IPython.display import clear_output, display, display_markdown, Markdown +from requests_aws4auth import AWS4Auth +#import copali +from requests.auth import HTTPBasicAuth +import bedrock_agent + + + +st.set_page_config( + #page_title="Semantic Search using OpenSearch", + layout="wide", + page_icon="images/opensearch_mark_default.png" +) +parent_dirname = '/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp' +USER_ICON = "images/user.png" +AI_ICON = "images/opensearch-twitter-card.png" +REGENERATE_ICON = "images/regenerate.png" +s3_bucket_ = "pdf-repo-uploads" + #"pdf-repo-uploads" +polly_client = boto3.Session( + region_name='us-east-1').client('polly') + +# Check if the user ID is already stored in the session state +if 'user_id' in st.session_state: + user_id = st.session_state['user_id'] + #print(f"User ID: {user_id}") + +# If the user ID is not yet stored in the session state, generate a random UUID +else: + user_id = str(uuid.uuid4()) + st.session_state['user_id'] = user_id + + +if 'session_id_' not in st.session_state: + st.session_state['session_id_'] = str(uuid.uuid1()) + +if "chats" not in st.session_state: + st.session_state.chats = [ + { + 'id': 0, + 'question': '', + 'answer': '' + } + ] + +if "questions__" not in st.session_state: + st.session_state.questions__ = [] + +if "answers__" not in st.session_state: + st.session_state.answers__ = [] + +if "input_index" not in st.session_state: + st.session_state.input_index = "hpijan2024hometrack"#"globalwarmingnew"#"hpijan2024hometrack_no_img_no_table" + +if "input_is_rerank" not in st.session_state: + st.session_state.input_is_rerank = True + +if "input_copali_rerank" not in st.session_state: + st.session_state.input_copali_rerank = False + +if "input_table_with_sql" not in st.session_state: + st.session_state.input_table_with_sql = False + + +if "inputs_" not in st.session_state: + st.session_state.inputs_ = {} + +if "input_shopping_query" not in st.session_state: + st.session_state.input_shopping_query="get me shoes suitable for trekking"#"What is the projected energy percentage from renewable sources in future?"#"Which city in United Kingdom has the highest average housing price ?"#"How many aged above 85 years died due to covid ?"# What is the projected energy from renewable sources ?" 
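# The widget defaults on this page are seeded one if-block at a time. The same
# defaults can be declared once and backfilled in a loop; this is a sketch only,
# mirroring a few of the keys used above rather than replacing those blocks.
_PAGE_DEFAULTS = {
    "questions__": [],
    "answers__": [],
    "input_is_rerank": True,
    "input_copali_rerank": False,
    "input_table_with_sql": False,
    "input_shopping_query": "get me shoes suitable for trekking",
}
for _key, _default in _PAGE_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default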
+ + +if "input_rag_searchType" not in st.session_state: + st.session_state.input_rag_searchType = ["Sparse Search"] + + + + +region = 'us-east-1' +#bedrock_runtime_client = boto3.client('bedrock-runtime',region_name=region) +output = [] +service = 'es' + +st.markdown(""" + + """,unsafe_allow_html=True) + +################ OpenSearch Py client ##################### + +# credentials = boto3.Session().get_credentials() +# awsauth = AWSV4SignerAuth(credentials, region, service) + +# ospy_client = OpenSearch( +# hosts = [{'host': 'search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com', 'port': 443}], +# http_auth = awsauth, +# use_ssl = True, +# verify_certs = True, +# connection_class = RequestsHttpConnection, +# pool_maxsize = 20 +# ) + +################# using boto3 credentials ################### + + +# credentials = boto3.Session().get_credentials() +# awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token) +# service = 'es' + + +################# using boto3 credentials #################### + + + +# if "input_searchType" not in st.session_state: +# st.session_state.input_searchType = "Conversational Search (RAG)" + +# if "input_temperature" not in st.session_state: +# st.session_state.input_temperature = "0.001" + +# if "input_topK" not in st.session_state: +# st.session_state.input_topK = 200 + +# if "input_topP" not in st.session_state: +# st.session_state.input_topP = 0.95 + +# if "input_maxTokens" not in st.session_state: +# st.session_state.input_maxTokens = 1024 + + +def write_logo(): + col1, col2, col3 = st.columns([5, 1, 5]) + with col2: + st.image(AI_ICON, use_container_width='always') + +def write_top_bar(): + col1, col2 = st.columns([77,23]) + with col1: + st.page_link("app.py", label=":orange[Home]", icon="🏠") + st.header("AI Shopping assistant",divider='rainbow') + + #st.image(AI_ICON, use_container_width='always') + + with col2: + st.write("") + st.write("") + clear = st.button("Clear") + st.write("") + st.write("") + return clear + +clear = write_top_bar() + +if clear: + st.session_state.questions__ = [] + st.session_state.answers__ = [] + st.session_state.input_shopping_query="" + st.session_state.session_id_ = str(uuid.uuid1()) + bedrock_agent.delete_memory() + # st.session_state.input_searchType="Conversational Search (RAG)" + # st.session_state.input_temperature = "0.001" + # st.session_state.input_topK = 200 + # st.session_state.input_topP = 0.95 + # st.session_state.input_maxTokens = 1024 + + +def handle_input(): + print("Question: "+st.session_state.input_shopping_query) + print("-----------") + print("\n\n") + if(st.session_state.input_shopping_query==''): + return "" + inputs = {} + for key in st.session_state: + if key.startswith('input_'): + inputs[key.removeprefix('input_')] = st.session_state[key] + st.session_state.inputs_ = inputs + + ####### + + + #st.write(inputs) + question_with_id = { + 'question': inputs["shopping_query"], + 'id': len(st.session_state.questions__) + } + st.session_state.questions__.append(question_with_id) + print(inputs) + out_ = bedrock_agent.query_(inputs) + st.session_state.answers__.append({ + 'answer': out_['text'], + 'source':out_['source'], + 'last_tool':out_['last_tool'], + 'id': len(st.session_state.questions__) + + + }) + st.session_state.input_shopping_query="" + + + +# search_type = st.selectbox('Select the Search type', +# ('Conversational Search (RAG)', +# 'OpenSearch vector search', +# 'LLM Text Generation' +# ), + +# key = 
'input_searchType', +# help = "Select the type of retriever\n1. Conversational Search (Recommended) - This will include both the OpenSearch and LLM in the retrieval pipeline \n (note: This will put opensearch response as context to LLM to answer) \n2. OpenSearch vector search - This will put only OpenSearch's vector search in the pipeline, \n(Warning: this will lead to unformatted results )\n3. LLM Text Generation - This will include only LLM in the pipeline, \n(Warning: This will give hallucinated and out of context answers_)" +# ) + +# col1, col2, col3, col4 = st.columns(4) + +# with col1: +# st.text_input('Temperature', value = "0.001", placeholder='LLM Temperature', key = 'input_temperature',help = "Set the temperature of the Large Language model. \n Note: 1. Set this to values lower to 1 in the order of 0.001, 0.0001, such low values reduces hallucination and creativity in the LLM response; 2. This applies only when LLM is a part of the retriever pipeline") +# with col2: +# st.number_input('Top K', value = 200, placeholder='Top K', key = 'input_topK', step = 50, help = "This limits the LLM's predictions to the top k most probable tokens at each step of generation, this applies only when LLM is a prt of the retriever pipeline") +# with col3: +# st.number_input('Top P', value = 0.95, placeholder='Top P', key = 'input_topP', step = 0.05, help = "This sets a threshold probability and selects the top tokens whose cumulative probability exceeds the threshold while the tokens are generated by the LLM") +# with col4: +# st.number_input('Max Output Tokens', value = 500, placeholder='Max Output Tokens', key = 'input_maxTokens', step = 100, help = "This decides the total number of tokens generated as the final response. Note: Values greater than 1000 takes longer response time") + +# st.markdown('---') + + +def write_user_message(md): + col1, col2 = st.columns([3,97]) + + with col1: + st.image(USER_ICON, use_container_width='always') + with col2: + #st.warning(md['question']) + + st.markdown("
"+md['question']+"
", unsafe_allow_html = True) + + + +def render_answer(question,answer,index): + + + col1, col2, col_3 = st.columns([4,74,22]) + with col1: + st.image(AI_ICON, use_container_width='always') + with col2: + use_interim_results = False + src_dict = {} + ans_ = answer['answer'] + span_ans = ans_.replace('',"").replace("","") + st.markdown("

"+span_ans+"

",unsafe_allow_html = True) + print("answer['source']") + print("-------------") + print(answer['source']) + print("-------------") + print(answer['last_tool']) + if(answer['last_tool']['name'] in ["generate_images","get_relevant_items_for_image","get_relevant_items_for_text","retrieve_with_hybrid_search","retrieve_with_keyword_search","get_any_general_recommendation"]): + use_interim_results = True + src_dict =json.loads(answer['last_tool']['response'].replace("'",'"')) + print("src_dict") + print("-------------") + print(src_dict) + #if("get_relevant_items_for_text" in src_dict): + if(use_interim_results and answer['last_tool']['name']!= 'generate_images' and answer['last_tool']['name']!= 'get_any_general_recommendation'): + key_ = answer['last_tool']['name'] + + st.write("

",unsafe_allow_html = True) + img_col1, img_col2, img_col3 = st.columns([30,30,40]) + for index,item in enumerate(src_dict[key_]): + response_ = requests.get(item['image']) + img = Image.open(BytesIO(response_.content)) + resizedImg = img.resize((230, 180), Image.Resampling.LANCZOS) + if(index ==0): + with img_col1: + st.image(resizedImg,use_container_width = True,caption = item['title']) + if(index ==1): + with img_col2: + st.image(resizedImg,use_container_width = True,caption = item['title']) + #st.image(parent_dirname+"/retrieved_esci_images/"+item['id']+"_resized.jpg",caption = item['title'],use_container_width = True) + + + if(answer['last_tool']['name'] == "generate_images" or answer['last_tool']['name'] == "get_any_general_recommendation"): + st.write("
",unsafe_allow_html = True) + gen_img_col1, gen_img_col2,gen_img_col2 = st.columns([30,30,30]) + res = src_dict['generate_images'].replace('s3://','') + s3_ = boto3.resource('s3', + aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') + + key = res.split('/')[1] + s3_stream = s3_.Object("bedrock-video-generation-us-east-1-lbxkrh", key).get()['Body'].read() + img_ = Image.open(BytesIO(s3_stream)) + resizedImg = img_.resize((230, 180), Image.Resampling.LANCZOS) + with gen_img_col1: + st.image(resizedImg,caption = "Generated image for "+key.split(".")[0],use_container_width = True) + st.write("
",unsafe_allow_html = True) + + + + + + + + # def stream_(): + # #use for streaming response on the client side + # for word in ans_.split(" "): + # yield word + " " + # time.sleep(0.04) + # #use for streaming response from Llm directly + # if(isinstance(ans_,botocore.eventstream.EventStream)): + # for event in ans_: + # chunk = event.get('chunk') + + # if chunk: + + # chunk_obj = json.loads(chunk.get('bytes').decode()) + + # if('content_block' in chunk_obj or ('delta' in chunk_obj and 'text' in chunk_obj['delta'])): + # key_ = list(chunk_obj.keys())[2] + # text = chunk_obj[key_]['text'] + + # clear_output(wait=True) + # output.append(text) + # yield text + # time.sleep(0.04) + + + + # if(index == len(st.session_state.questions_)): + # st.write_stream(stream_) + # if(isinstance(st.session_state.answers_[index-1]['answer'],botocore.eventstream.EventStream)): + # st.session_state.answers_[index-1]['answer'] = "".join(output) + # else: + # st.write(ans_) + + + # polly_response = polly_client.synthesize_speech(VoiceId='Joanna', + # OutputFormat='ogg_vorbis', + # Text = ans_, + # Engine = 'neural') + + # audio_col1, audio_col2 = st.columns([50,50]) + # with audio_col1: + # st.audio(polly_response['AudioStream'].read(), format="audio/ogg") + + + + #st.markdown("
"+ans_+"
", unsafe_allow_html = True) + #st.markdown("
Relevant images from the document :
", unsafe_allow_html = True) + #st.write("") + colu1,colu2,colu3 = st.columns([4,82,20]) + if(answer['source']!={}): + with colu2: + with st.expander("Agent Traces:"): + st.write(answer['source']) + # with st.container(): + # if(len(res_img)>0): + # with st.expander("Images:"): + # col3,col4,col5 = st.columns([33,33,33]) + # cols = [col3,col4] + # idx = 0 + # #print(res_img) + # for img_ in res_img: + # if(img_['file'].lower()!='none' and idx < 2): + # img = img_['file'].split(".")[0] + # caption = img_['caption'] + + # with cols[idx]: + + # st.image(parent_dirname+"/figures/"+st.session_state.input_index+"/"+img+".jpg") + # #st.write(caption) + # idx = idx+1 + # #st.markdown("
Sources from the document:
", unsafe_allow_html = True) + # if(len(answer["table"] )>0): + # with st.expander("Table:"): + # df = pd.read_csv(answer["table"][0]['name'],skipinitialspace = True, on_bad_lines='skip',delimiter='`') + # df.fillna(method='pad', inplace=True) + # st.table(df) + # with st.expander("Raw sources:"): + # st.write(answer["source"]) + + + + # with col_3: + + # #st.markdown("
"+",".join(st.session_state.input_rag_searchType)+"
", unsafe_allow_html = True) + + + + # if(index == len(st.session_state.questions_)): + + # rdn_key = ''.join([random.choice(string.ascii_letters) + # for _ in range(10)]) + # currentValue = ''.join(st.session_state.input_rag_searchType)+str(st.session_state.input_is_rerank)+str(st.session_state.input_table_with_sql)+st.session_state.input_index + # oldValue = ''.join(st.session_state.inputs_["rag_searchType"])+str(st.session_state.inputs_["is_rerank"])+str(st.session_state.inputs_["table_with_sql"])+str(st.session_state.inputs_["index"]) + # #print("changing values-----------------") + # def on_button_click(): + # # print("button clicked---------------") + # # print(currentValue) + # # print(oldValue) + # if(currentValue!=oldValue or 1==1): + # #print("----------regenerate----------------") + # st.session_state.input_query = st.session_state.questions_[-1]["question"] + # st.session_state.answers_.pop() + # st.session_state.questions_.pop() + + # handle_input() + # with placeholder.container(): + # render_all() + + # if("currentValue" in st.session_state): + # del st.session_state["currentValue"] + + # try: + # del regenerate + # except: + # pass + + # #print("------------------------") + # #print(st.session_state) + + # placeholder__ = st.empty() + + # placeholder__.button("🔄",key=rdn_key,on_click=on_button_click) + +#Each answer will have context of the question asked in order to associate the provided feedback with the respective question +def write_chat_message(md, q,index): + #res_img = md['image'] + #st.session_state['session_id'] = res['session_id'] to be added in memory + chat = st.container() + with chat: + #print("st.session_state.input_index------------------") + #print(st.session_state.input_index) + render_answer(q,md,index) + +def render_all(): + index = 0 + for (q, a) in zip(st.session_state.questions__, st.session_state.answers__): + index = index +1 + write_user_message(q) + write_chat_message(a, q,index) + +placeholder = st.empty() +with placeholder.container(): + render_all() + +st.markdown("") +col_2, col_3 = st.columns([75,20]) +#col_1, col_2, col_3 = st.columns([7.5,71.5,22]) +# with col_1: +# st.markdown("

Ask:

",unsafe_allow_html=True, help = 'Enter the questions and click on "GO"') + +with col_2: + #st.markdown("") + input = st.text_input( "Ask here",label_visibility = "collapsed",key="input_shopping_query") +with col_3: + #hidden = st.button("RUN",disabled=True,key = "hidden") + # audio_value = st.audio_input("Record a voice message") + # print(audio_value) + play = st.button("GO",on_click=handle_input,key = "play") +#with st.sidebar: + # st.page_link("/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp/app.py", label=":orange[Home]", icon="🏠") + # st.subheader(":blue[Sample Data]") + # coln_1,coln_2 = st.columns([70,30]) + # # index_select = st.radio("Choose one index",["UK Housing","Covid19 impacts on Ireland","Environmental Global Warming","BEIR Research"], + # # captions = ['[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/HPI-Jan-2024-Hometrack.pdf)', + # # '[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/covid19_ie.pdf)', + # # '[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/global_warming.pdf)', + # # '[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/BEIR.pdf)'], + # # key="input_rad_index") + # with coln_1: + # index_select = st.radio("Choose one index",["UK Housing","Global Warming stats","Covid19 impacts on Ireland"],key="input_rad_index") + # with coln_2: + # st.markdown("

Preview file

",unsafe_allow_html=True) + # st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/HPI-Jan-2024-Hometrack.pdf)") + # st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/global_warming.pdf)") + # st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/covid19_ie.pdf)") + # #st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/BEIR.pdf)") + # st.markdown(""" + # + # """,unsafe_allow_html=True) + # # Initialize boto3 to use the S3 client. + # s3_client = boto3.resource('s3') + # bucket=s3_client.Bucket(s3_bucket_) + + # objects = bucket.objects.filter(Prefix="sample_pdfs/") + # urls = [] + + # client = boto3.client('s3') + + # for obj in objects: + # if obj.key.endswith('.pdf'): + + # # Generate the S3 presigned URL + # s3_presigned_url = client.generate_presigned_url( + # ClientMethod='get_object', + # Params={ + # 'Bucket': s3_bucket_, + # 'Key': obj.key + # }, + # ExpiresIn=3600 + # ) + + # # Print the created S3 presigned URL + # print(s3_presigned_url) + # urls.append(s3_presigned_url) + # #st.write("["+obj.key.split('/')[1]+"]("+s3_presigned_url+")") + # st.link_button(obj.key.split('/')[1], s3_presigned_url) + + + # st.subheader(":blue[Your multi-modal documents]") + # pdf_doc_ = st.file_uploader( + # "Upload your PDFs here and click on 'Process'", accept_multiple_files=False) + + + # pdf_docs = [pdf_doc_] + # if st.button("Process"): + # with st.spinner("Processing"): + # if os.path.isdir(parent_dirname+"/pdfs") == False: + # os.mkdir(parent_dirname+"/pdfs") + + # for pdf_doc in pdf_docs: + # print(type(pdf_doc)) + # pdf_doc_name = (pdf_doc.name).replace(" ","_") + # with open(os.path.join(parent_dirname+"/pdfs",pdf_doc_name),"wb") as f: + # f.write(pdf_doc.getbuffer()) + + # request_ = { "bucket": s3_bucket_,"key": pdf_doc_name} + # # if(st.session_state.input_copali_rerank): + # # copali.process_doc(request_) + # # else: + # rag_DocumentLoader.load_docs(request_) + # print('lambda done') + # st.success('you can start searching on your PDF') + + # ############## haystach demo temporary addition ############ + # # st.subheader(":blue[Multimodality]") + # # colu1,colu2 = st.columns([50,50]) + # # with colu1: + # # in_images = st.toggle('Images', key = 'in_images', disabled = False) + # # with colu2: + # # in_tables = st.toggle('Tables', key = 'in_tables', disabled = False) + # # if(in_tables): + # # st.session_state.input_table_with_sql = True + # # else: + # # st.session_state.input_table_with_sql = False + + # ############## haystach demo temporary addition ############ + # if(pdf_doc_ is None or pdf_doc_ == ""): + # if(index_select == "Global Warming stats"): + # st.session_state.input_index = "globalwarmingnew" + # if(index_select == "Covid19 impacts on Ireland"): + # st.session_state.input_index = "covid19ie"#"choosetheknnalgorithmforyourbillionscaleusecasewithopensearchawsbigdatablog" + # if(index_select == "BEIR"): + # st.session_state.input_index = "2104" + # if(index_select == "UK Housing"): + # st.session_state.input_index = "hpijan2024hometrack" + # # if(in_images == True and in_tables == True): + # # st.session_state.input_index = "hpijan2024hometrack" + # # else: + # # if(in_images 
== True and in_tables == False): + # # st.session_state.input_index = "hpijan2024hometrackno_table" + # # else: + # # if(in_images == False and in_tables == True): + # # st.session_state.input_index = "hpijan2024hometrackno_images" + # # else: + # # st.session_state.input_index = "hpijan2024hometrack_no_img_no_table" + + + # # if(in_images): + # # st.session_state.input_include_images = True + # # else: + # # st.session_state.input_include_images = False + # # if(in_tables): + # # st.session_state.input_include_tables = True + # # else: + # # st.session_state.input_include_tables = False + + # custom_index = st.text_input("If uploaded the file already, enter the original file name", value = "") + # if(custom_index!=""): + # st.session_state.input_index = re.sub('[^A-Za-z0-9]+', '', (custom_index.lower().replace(".pdf","").split("/")[-1].split(".")[0]).lower()) + + + + # st.subheader(":blue[Retriever]") + # search_type = st.multiselect('Select the Retriever(s)', + # ['Keyword Search', + # 'Vector Search', + # 'Sparse Search', + # ], + # ['Sparse Search'], + + # key = 'input_rag_searchType', + # help = "Select the type of Search, adding more than one search type will activate hybrid search"#\n1. Conversational Search (Recommended) - This will include both the OpenSearch and LLM in the retrieval pipeline \n (note: This will put opensearch response as context to LLM to answer) \n2. OpenSearch vector search - This will put only OpenSearch's vector search in the pipeline, \n(Warning: this will lead to unformatted results )\n3. LLM Text Generation - This will include only LLM in the pipeline, \n(Warning: This will give hallucinated and out of context answers)" + # ) + + # re_rank = st.checkbox('Re-rank results', key = 'input_re_rank', disabled = False, value = True, help = "Checking this box will re-rank the results using a cross-encoder model") + + # if(re_rank): + # st.session_state.input_is_rerank = True + # else: + # st.session_state.input_is_rerank = False + + # # copali_rerank = st.checkbox("Search and Re-rank with Token level vectors",key = 'copali_rerank',help = "Enabling this option uses 'Copali' model's page level image embeddings to retrieve documents and MaxSim to re-rank the pages.\n\n Hugging Face Model: https://huggingface.co/vidore/colpali") + + # # if(copali_rerank): + # # st.session_state.input_copali_rerank = True + # # else: + # # st.session_state.input_copali_rerank = False + + diff --git a/pages/Multimodal_Conversational_Search.py b/pages/Multimodal_Conversational_Search.py new file mode 100644 index 0000000000000000000000000000000000000000..9886321712d1cdc3c4694de172fbda67fc250115 --- /dev/null +++ b/pages/Multimodal_Conversational_Search.py @@ -0,0 +1,557 @@ +import streamlit as st +import uuid +import os +import re +import sys +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/semantic_search") +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/RAG") +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities") +import boto3 +import requests +from boto3 import Session +import botocore.session +import json +import random +import string +import rag_DocumentLoader +import rag_DocumentSearcher +import pandas as pd +from PIL import Image +import shutil +import base64 +import time +import botocore +#from langchain.callbacks.base import BaseCallbackHandler +#import streamlit_nested_layout +#from IPython.display import clear_output, display, display_markdown, Markdown +from requests_aws4auth import 
AWS4Auth +#import copali +from requests.auth import HTTPBasicAuth + + + +st.set_page_config( + #page_title="Semantic Search using OpenSearch", + layout="wide", + page_icon="images/opensearch_mark_default.png" +) +parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) +USER_ICON = "images/user.png" +AI_ICON = "images/opensearch-twitter-card.png" +REGENERATE_ICON = "images/regenerate.png" +s3_bucket_ = "pdf-repo-uploads" + #"pdf-repo-uploads" +polly_client = boto3.client('polly',aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') + +# Check if the user ID is already stored in the session state +if 'user_id' in st.session_state: + user_id = st.session_state['user_id'] + #print(f"User ID: {user_id}") + +# If the user ID is not yet stored in the session state, generate a random UUID +else: + user_id = str(uuid.uuid4()) + st.session_state['user_id'] = user_id + + +if 'session_id' not in st.session_state: + st.session_state['session_id'] = "" + +if "chats" not in st.session_state: + st.session_state.chats = [ + { + 'id': 0, + 'question': '', + 'answer': '' + } + ] + +if "questions_" not in st.session_state: + st.session_state.questions_ = [] + +if "answers_" not in st.session_state: + st.session_state.answers_ = [] + +if "input_index" not in st.session_state: + st.session_state.input_index = "hpijan2024hometrack"#"globalwarmingnew"#"hpijan2024hometrack_no_img_no_table" + +if "input_is_rerank" not in st.session_state: + st.session_state.input_is_rerank = True + +if "input_copali_rerank" not in st.session_state: + st.session_state.input_copali_rerank = False + +if "input_table_with_sql" not in st.session_state: + st.session_state.input_table_with_sql = False + +if "input_query" not in st.session_state: + st.session_state.input_query="which city has the highest average housing price in UK ?"#"What is the projected energy percentage from renewable sources in future?"#"Which city in United Kingdom has the highest average housing price ?"#"How many aged above 85 years died due to covid ?"# What is the projected energy from renewable sources ?" 
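# render_answer further down reads each answer aloud with Amazon Polly and plays
# it back through st.audio. The core call, reduced to a standalone sketch (the
# page passes explicit credentials from st.secrets; the default credential chain
# is assumed here):
def synthesize_answer_audio(text, polly=None):
    """Return OGG/Vorbis audio bytes for the given text, using the page's voice settings."""
    polly = polly or boto3.client("polly", region_name="us-east-1")
    speech = polly.synthesize_speech(
        VoiceId="Joanna",
        OutputFormat="ogg_vorbis",
        Engine="neural",
        Text=text,
    )
    return speech["AudioStream"].read()

# Usage on the page would be: st.audio(synthesize_answer_audio(ans_), format="audio/ogg")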
+ + +# if "input_rag_searchType" not in st.session_state: +# st.session_state.input_rag_searchType = ["Sparse Search"] + + + + +region = 'us-east-1' +bedrock_runtime_client = boto3.client('bedrock-runtime',region_name=region) +output = [] +service = 'es' + +st.markdown(""" + + """,unsafe_allow_html=True) + +################ OpenSearch Py client ##################### + +# credentials = boto3.Session().get_credentials() +# awsauth = AWSV4SignerAuth(credentials, region, service) + +# ospy_client = OpenSearch( +# hosts = [{'host': 'search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com', 'port': 443}], +# http_auth = awsauth, +# use_ssl = True, +# verify_certs = True, +# connection_class = RequestsHttpConnection, +# pool_maxsize = 20 +# ) + +################# using boto3 credentials ################### + + +credentials = boto3.Session().get_credentials() +awsauth = HTTPBasicAuth('prasadnu',st.secrets['rag_shopping_assistant_os_api_access']) +service = 'es' + + +################# using boto3 credentials #################### + + + +# if "input_searchType" not in st.session_state: +# st.session_state.input_searchType = "Conversational Search (RAG)" + +# if "input_temperature" not in st.session_state: +# st.session_state.input_temperature = "0.001" + +# if "input_topK" not in st.session_state: +# st.session_state.input_topK = 200 + +# if "input_topP" not in st.session_state: +# st.session_state.input_topP = 0.95 + +# if "input_maxTokens" not in st.session_state: +# st.session_state.input_maxTokens = 1024 + + +def write_logo(): + col1, col2, col3 = st.columns([5, 1, 5]) + with col2: + st.image(AI_ICON, use_container_width='always') + +def write_top_bar(): + col1, col2 = st.columns([77,23]) + with col1: + st.write("") + st.header("Chat with your data",divider='rainbow') + + #st.image(AI_ICON, use_container_width='always') + + with col2: + st.write("") + st.write("") + clear = st.button("Clear") + st.write("") + st.write("") + return clear + +clear = write_top_bar() + +if clear: + st.session_state.questions_ = [] + st.session_state.answers_ = [] + st.session_state.input_query="" + # st.session_state.input_searchType="Conversational Search (RAG)" + # st.session_state.input_temperature = "0.001" + # st.session_state.input_topK = 200 + # st.session_state.input_topP = 0.95 + # st.session_state.input_maxTokens = 1024 + + +def handle_input(): + print("Question: "+st.session_state.input_query) + print("-----------") + print("\n\n") + if(st.session_state.input_query==''): + return "" + inputs = {} + for key in st.session_state: + if key.startswith('input_'): + inputs[key.removeprefix('input_')] = st.session_state[key] + st.session_state.inputs_ = inputs + + ####### + + + #st.write(inputs) + question_with_id = { + 'question': inputs["query"], + 'id': len(st.session_state.questions_) + } + st.session_state.questions_.append(question_with_id) + out_ = rag_DocumentSearcher.query_(awsauth, inputs, st.session_state['session_id'],st.session_state.input_rag_searchType) + st.session_state.answers_.append({ + 'answer': out_['text'], + 'source':out_['source'], + 'id': len(st.session_state.questions_), + 'image': out_['image'], + 'table':out_['table'] + }) + st.session_state.input_query="" + + + +# search_type = st.selectbox('Select the Search type', +# ('Conversational Search (RAG)', +# 'OpenSearch vector search', +# 'LLM Text Generation' +# ), + +# key = 'input_searchType', +# help = "Select the type of retriever\n1. 
Conversational Search (Recommended) - This will include both the OpenSearch and LLM in the retrieval pipeline \n (note: This will put opensearch response as context to LLM to answer) \n2. OpenSearch vector search - This will put only OpenSearch's vector search in the pipeline, \n(Warning: this will lead to unformatted results )\n3. LLM Text Generation - This will include only LLM in the pipeline, \n(Warning: This will give hallucinated and out of context answers_)" +# ) + +# col1, col2, col3, col4 = st.columns(4) + +# with col1: +# st.text_input('Temperature', value = "0.001", placeholder='LLM Temperature', key = 'input_temperature',help = "Set the temperature of the Large Language model. \n Note: 1. Set this to values lower to 1 in the order of 0.001, 0.0001, such low values reduces hallucination and creativity in the LLM response; 2. This applies only when LLM is a part of the retriever pipeline") +# with col2: +# st.number_input('Top K', value = 200, placeholder='Top K', key = 'input_topK', step = 50, help = "This limits the LLM's predictions to the top k most probable tokens at each step of generation, this applies only when LLM is a prt of the retriever pipeline") +# with col3: +# st.number_input('Top P', value = 0.95, placeholder='Top P', key = 'input_topP', step = 0.05, help = "This sets a threshold probability and selects the top tokens whose cumulative probability exceeds the threshold while the tokens are generated by the LLM") +# with col4: +# st.number_input('Max Output Tokens', value = 500, placeholder='Max Output Tokens', key = 'input_maxTokens', step = 100, help = "This decides the total number of tokens generated as the final response. Note: Values greater than 1000 takes longer response time") + +# st.markdown('---') + + +def write_user_message(md): + col1, col2 = st.columns([3,97]) + + with col1: + st.image(USER_ICON, use_container_width='always') + with col2: + #st.warning(md['question']) + + st.markdown("
"+md['question']+"
", unsafe_allow_html = True) + + + +def render_answer(question,answer,index,res_img): + + + col1, col2, col_3 = st.columns([4,74,22]) + with col1: + st.image(AI_ICON, use_container_width='always') + with col2: + ans_ = answer['answer'] + st.write(ans_) + + + + # def stream_(): + # #use for streaming response on the client side + # for word in ans_.split(" "): + # yield word + " " + # time.sleep(0.04) + # #use for streaming response from Llm directly + # if(isinstance(ans_,botocore.eventstream.EventStream)): + # for event in ans_: + # chunk = event.get('chunk') + + # if chunk: + + # chunk_obj = json.loads(chunk.get('bytes').decode()) + + # if('content_block' in chunk_obj or ('delta' in chunk_obj and 'text' in chunk_obj['delta'])): + # key_ = list(chunk_obj.keys())[2] + # text = chunk_obj[key_]['text'] + + # clear_output(wait=True) + # output.append(text) + # yield text + # time.sleep(0.04) + + + + # if(index == len(st.session_state.questions_)): + # st.write_stream(stream_) + # if(isinstance(st.session_state.answers_[index-1]['answer'],botocore.eventstream.EventStream)): + # st.session_state.answers_[index-1]['answer'] = "".join(output) + # else: + # st.write(ans_) + + + polly_response = polly_client.synthesize_speech(VoiceId='Joanna', + OutputFormat='ogg_vorbis', + Text = ans_, + Engine = 'neural') + + audio_col1, audio_col2 = st.columns([50,50]) + with audio_col1: + st.audio(polly_response['AudioStream'].read(), format="audio/ogg") + + + + #st.markdown("
"+ans_+"
", unsafe_allow_html = True) + #st.markdown("
Relevant images from the document :
", unsafe_allow_html = True) + #st.write("") + colu1,colu2,colu3 = st.columns([4,82,20]) + with colu2: + #with st.expander("Relevant Sources:"): + with st.container(): + if(len(res_img)>0): + with st.expander("Relevant Sources:"): + #with st.expander("Images:"): + st.write("Images:") + col3,col4,col5 = st.columns([33,33,33]) + cols = [col3,col4] + idx = 0 + #print(res_img) + for img_ in res_img: + if(img_['file'].lower()!='none' and idx < 2): + img = img_['file'].split(".")[0] + caption = img_['caption'] + + with cols[idx]: + + st.image(parent_dirname+"/figures/"+st.session_state.input_index+"/"+img+".jpg") + #st.write(caption) + idx = idx+1 + #st.markdown("
Sources from the document:
", unsafe_allow_html = True) + if(len(answer["table"] )>0): + #with st.expander("Table:"): + st.write("Table:") + df = pd.read_csv(answer["table"][0]['name'],skipinitialspace = True, on_bad_lines='skip',delimiter='`') + df.fillna(method='pad', inplace=True) + st.table(df) + #with st.expander("Raw sources:"): + st.write("Raw sources:") + st.write(answer["source"]) + + + + with col_3: + + #st.markdown("
"+",".join(st.session_state.input_rag_searchType)+"
", unsafe_allow_html = True) + + + + if(index == len(st.session_state.questions_)): + + rdn_key = ''.join([random.choice(string.ascii_letters) + for _ in range(10)]) + currentValue = ''.join(st.session_state.input_rag_searchType)+str(st.session_state.input_is_rerank)+str(st.session_state.input_table_with_sql)+st.session_state.input_index + oldValue = ''.join(st.session_state.inputs_["rag_searchType"])+str(st.session_state.inputs_["is_rerank"])+str(st.session_state.inputs_["table_with_sql"])+str(st.session_state.inputs_["index"]) + #print("changing values-----------------") + def on_button_click(): + # print("button clicked---------------") + # print(currentValue) + # print(oldValue) + if(currentValue!=oldValue or 1==1): + #print("----------regenerate----------------") + st.session_state.input_query = st.session_state.questions_[-1]["question"] + st.session_state.answers_.pop() + st.session_state.questions_.pop() + + handle_input() + with placeholder.container(): + render_all() + + if("currentValue" in st.session_state): + del st.session_state["currentValue"] + + try: + del regenerate + except: + pass + + #print("------------------------") + #print(st.session_state) + + placeholder__ = st.empty() + + placeholder__.button("🔄",key=rdn_key,on_click=on_button_click) + +#Each answer will have context of the question asked in order to associate the provided feedback with the respective question +def write_chat_message(md, q,index): + res_img = md['image'] + #st.session_state['session_id'] = res['session_id'] to be added in memory + chat = st.container() + with chat: + #print("st.session_state.input_index------------------") + #print(st.session_state.input_index) + render_answer(q,md,index,res_img) + +def render_all(): + index = 0 + for (q, a) in zip(st.session_state.questions_, st.session_state.answers_): + index = index +1 + + write_user_message(q) + write_chat_message(a, q,index) + +placeholder = st.empty() +with placeholder.container(): + render_all() + +st.markdown("") +col_2, col_3 = st.columns([75,20]) +#col_1, col_2, col_3 = st.columns([7.5,71.5,22]) +# with col_1: +# st.markdown("

Ask:

",unsafe_allow_html=True, help = 'Enter the questions and click on "GO"') + +with col_2: + #st.markdown("") + input = st.text_input( "Ask here",label_visibility = "collapsed",key="input_query") +with col_3: + #hidden = st.button("RUN",disabled=True,key = "hidden") + play = st.button("GO",on_click=handle_input,key = "play") +with st.sidebar: + st.page_link("app.py", label=":orange[Home]", icon="🏠") + st.subheader(":blue[Sample Data]") + coln_1,coln_2 = st.columns([70,30]) + # index_select = st.radio("Choose one index",["UK Housing","Covid19 impacts on Ireland","Environmental Global Warming","BEIR Research"], + # captions = ['[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/HPI-Jan-2024-Hometrack.pdf)', + # '[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/covid19_ie.pdf)', + # '[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/global_warming.pdf)', + # '[preview](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/BEIR.pdf)'], + # key="input_rad_index") + with coln_1: + index_select = st.radio("Choose one index",["UK Housing","Global Warming stats","Covid19 impacts on Ireland"],key="input_rad_index") + with coln_2: + st.markdown("

<p>Preview file</p>

",unsafe_allow_html=True) + st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/HPI-Jan-2024-Hometrack.pdf)") + st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/global_warming.pdf)") + st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/covid19_ie.pdf)") + #st.write("[:eyes:](https://github.com/aws-samples/AI-search-with-amazon-opensearch-service/blob/b559f82c07dfcca973f457c0a15d6444752553ab/rag/sample_pdfs/BEIR.pdf)") + st.markdown(""" + + """,unsafe_allow_html=True) + # Initialize boto3 to use the S3 client. + s3_client = boto3.resource('s3',aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') + bucket=s3_client.Bucket(s3_bucket_) + + objects = bucket.objects.filter(Prefix="sample_pdfs/") + urls = [] + + client = boto3.client('s3',aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') + + for obj in objects: + if obj.key.endswith('.pdf'): + + # Generate the S3 presigned URL + s3_presigned_url = client.generate_presigned_url( + ClientMethod='get_object', + Params={ + 'Bucket': s3_bucket_, + 'Key': obj.key + }, + ExpiresIn=3600 + ) + + # Print the created S3 presigned URL + print(s3_presigned_url) + urls.append(s3_presigned_url) + #st.write("["+obj.key.split('/')[1]+"]("+s3_presigned_url+")") + st.link_button(obj.key.split('/')[1], s3_presigned_url) + + + # st.subheader(":blue[Your multi-modal documents]") + # pdf_doc_ = st.file_uploader( + # "Upload your PDFs here and click on 'Process'", accept_multiple_files=False) + + + # pdf_docs = [pdf_doc_] + # if st.button("Process"): + # with st.spinner("Processing"): + # if os.path.isdir(parent_dirname+"/pdfs") == False: + # os.mkdir(parent_dirname+"/pdfs") + + # for pdf_doc in pdf_docs: + # print(type(pdf_doc)) + # pdf_doc_name = (pdf_doc.name).replace(" ","_") + # with open(os.path.join(parent_dirname+"/pdfs",pdf_doc_name),"wb") as f: + # f.write(pdf_doc.getbuffer()) + + # request_ = { "bucket": s3_bucket_,"key": pdf_doc_name} + # # if(st.session_state.input_copali_rerank): + # # copali.process_doc(request_) + # # else: + # rag_DocumentLoader.load_docs(request_) + # print('lambda done') + # st.success('you can start searching on your PDF') + + + # if(pdf_doc_ is None or pdf_doc_ == ""): + # if(index_select == "Global Warming stats"): + # st.session_state.input_index = "globalwarmingnew" + # if(index_select == "Covid19 impacts on Ireland"): + # st.session_state.input_index = "covid19ie"#"choosetheknnalgorithmforyourbillionscaleusecasewithopensearchawsbigdatablog" + # if(index_select == "BEIR"): + # st.session_state.input_index = "2104" + # if(index_select == "UK Housing"): + # st.session_state.input_index = "ukhousingstats" + + + # custom_index = st.text_input("If uploaded the file already, enter the original file name", value = "") + # if(custom_index!=""): + # st.session_state.input_index = re.sub('[^A-Za-z0-9]+', '', (custom_index.lower().replace(".pdf","").split("/")[-1].split(".")[0]).lower()) + + + + st.subheader(":blue[Retriever]") + search_type = st.multiselect('Select the Retriever(s)', + ['Keyword Search', + 'Vector Search', + 'Sparse Search', + ], + ['Sparse Search'], + + key = 
'input_rag_searchType', + help = "Select the type of Search, adding more than one search type will activate hybrid search"#\n1. Conversational Search (Recommended) - This will include both the OpenSearch and LLM in the retrieval pipeline \n (note: This will put opensearch response as context to LLM to answer) \n2. OpenSearch vector search - This will put only OpenSearch's vector search in the pipeline, \n(Warning: this will lead to unformatted results )\n3. LLM Text Generation - This will include only LLM in the pipeline, \n(Warning: This will give hallucinated and out of context answers)" + ) + + re_rank = st.checkbox('Re-rank results', key = 'input_re_rank', disabled = False, value = True, help = "Checking this box will re-rank the results using a cross-encoder model") + + if(re_rank): + st.session_state.input_is_rerank = True + else: + st.session_state.input_is_rerank = False + + # copali_rerank = st.checkbox("Search and Re-rank with Token level vectors",key = 'copali_rerank',help = "Enabling this option uses 'Copali' model's page level image embeddings to retrieve documents and MaxSim to re-rank the pages.\n\n Hugging Face Model: https://huggingface.co/vidore/colpali") + + # if(copali_rerank): + # st.session_state.input_copali_rerank = True + # else: + # st.session_state.input_copali_rerank = False + + diff --git a/pages/Semantic_Search.py b/pages/Semantic_Search.py new file mode 100644 index 0000000000000000000000000000000000000000..805646153514d49f15cda688a42867c4496ee1b5 --- /dev/null +++ b/pages/Semantic_Search.py @@ -0,0 +1,1212 @@ +import streamlit as st +import math +import io +import uuid +import os +import sys +import boto3 +import requests +from requests_aws4auth import AWS4Auth +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/semantic_search") +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/RAG") +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities") +from boto3 import Session +from pathlib import Path +import botocore.session +import subprocess +#import os_index_df_sql +import json +import random +import string +from PIL import Image +import urllib.request +import base64 +import shutil +import re +from requests.auth import HTTPBasicAuth +import utilities.re_ranker as re_ranker +# from nltk.stem import PorterStemmer +# from nltk.tokenize import word_tokenize +import query_rewrite +import amazon_rekognition +#from st_click_detector import click_detector +import llm_eval +import all_search_execute + + + +st.set_page_config( + #page_title="Semantic Search using OpenSearch", + #layout="wide", + page_icon="images/opensearch_mark_default.png" +) +parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) +st.markdown(""" + + """, unsafe_allow_html=True) +#ps = PorterStemmer() + +st.session_state.REGION = 'us-east-1' + + +#from langchain.callbacks.base import BaseCallbackHandler + + +USER_ICON = "images/user.png" +AI_ICON = "images/opensearch-twitter-card.png" +REGENERATE_ICON = "images/regenerate.png" +IMAGE_ICON = "images/Image_Icon.png" +TEXT_ICON = "images/text.png" +s3_bucket_ = "pdf-repo-uploads" + #"pdf-repo-uploads" + +# Check if the user ID is already stored in the session state +if 'user_id' in st.session_state: + user_id = st.session_state['user_id'] + print(f"User ID: {user_id}") + +# If the user ID is not yet stored in the session state, generate a random UUID +# else: +# user_id = str(uuid.uuid4()) +# st.session_state['user_id'] = user_id +# dynamodb = 
boto3.resource('dynamodb') +# table = dynamodb.Table('ml-search') + + + +if 'session_id' not in st.session_state: + st.session_state['session_id'] = "" + +if 'input_reranker' not in st.session_state: + st.session_state['input_reranker'] = "None"#"Cross Encoder" + +if "chats" not in st.session_state: + st.session_state.chats = [ + { + 'id': 0, + 'question': '', + 'answer': '' + } + ] + +if "questions" not in st.session_state: + st.session_state.questions = [] + +if "clear_" not in st.session_state: + st.session_state.clear_ = False + +if "input_clear_filter" not in st.session_state: + st.session_state.input_clear_filter = False + + +if "radio_disabled" not in st.session_state: + st.session_state.radio_disabled = True + +if "input_rad_1" not in st.session_state: + st.session_state.input_rad_1 = "" + +if "input_manual_filter" not in st.session_state: + st.session_state.input_manual_filter = "" + +if "input_category" not in st.session_state: + st.session_state.input_category = None + +if "input_gender" not in st.session_state: + st.session_state.input_gender = None + +# if "input_price" not in st.session_state: +# st.session_state.input_price = (0,0) + +if "input_sql_query" not in st.session_state: + st.session_state.input_sql_query = "" +if "input_rewritten_query" not in st.session_state: + st.session_state.input_rewritten_query = "" + +if "input_hybridType" not in st.session_state: + st.session_state.input_hybridType = "OpenSearch Hybrid Query" + +if "ndcg_increase" not in st.session_state: + st.session_state.ndcg_increase = " ~ " + +if "inputs_" not in st.session_state: + st.session_state.inputs_ = {} + +if "img_container" not in st.session_state: + st.session_state.img_container = "" + +if "input_rekog_directoutput" not in st.session_state: + st.session_state.input_rekog_directoutput = {} + +if "input_weightage" not in st.session_state: + st.session_state.input_weightage = {} + +if "img_gen" not in st.session_state: + st.session_state.img_gen = [] + +if "answers" not in st.session_state: + st.session_state.answers = [] + +if "answers_none_rank" not in st.session_state: + st.session_state.answers_none_rank = [] + + +if "input_text" not in st.session_state: + st.session_state.input_text="black jacket for men"#"black jacket for men under 120 dollars" + +if "input_ndcg" not in st.session_state: + st.session_state.input_ndcg=0.0 + +if "gen_image_str" not in st.session_state: + st.session_state.gen_image_str="" + +# if "input_searchType" not in st.session_state: +# st.session_state.input_searchType = ['Keyword Search'] + +# if "input_must" not in st.session_state: +# st.session_state.input_must = ["Category","Price","Gender","Style"] + +if "input_NormType" not in st.session_state: + st.session_state.input_NormType = "min_max" + +if "input_CombineType" not in st.session_state: + st.session_state.input_CombineType = "arithmetic_mean" + +if "input_sparse" not in st.session_state: + st.session_state.input_sparse = "disabled" + +if "input_evaluate" not in st.session_state: + st.session_state.input_evaluate = "disabled" + +if "input_is_rewrite_query" not in st.session_state: + st.session_state.input_is_rewrite_query = "disabled" + + +if "input_rekog_label" not in st.session_state: + st.session_state.input_rekog_label = "" + + +if "input_sparse_filter" not in st.session_state: + st.session_state.input_sparse_filter = 0.5 + +if "input_modelType" not in st.session_state: + st.session_state.input_modelType = "Titan-Embed-Text-v1" + +if "input_weight" not in st.session_state: + 
st.session_state.input_weight = 0.5 + +if "image_prompt2" not in st.session_state: + st.session_state.image_prompt2 = "" + +if "image_prompt" not in st.session_state: + st.session_state.image_prompt = "" + +if "bytes_for_rekog" not in st.session_state: + st.session_state.bytes_for_rekog = "" + +if "OpenSearchDomainEndpoint" not in st.session_state: + st.session_state.OpenSearchDomainEndpoint = "search-opensearchservi-shjckef2t7wo-iyv6rajdgxg6jas25aupuxev6i.us-west-2.es.amazonaws.com" + +if "max_selections" not in st.session_state: + st.session_state.max_selections = "None" + +if "re_ranker" not in st.session_state: + st.session_state.re_ranker = "true" + +host = 'https://'+st.session_state.OpenSearchDomainEndpoint+'/' +service = 'es' +#credentials = boto3.Session().get_credentials() +awsauth = awsauth = HTTPBasicAuth('master',st.secrets['ml_search_demo_api_access']) +headers = {"Content-Type": "application/json"} + +if "REGION" not in st.session_state: + st.session_state.REGION = "" + +if "BEDROCK_MULTIMODAL_MODEL_ID" not in st.session_state: + st.session_state.BEDROCK_MULTIMODAL_MODEL_ID = "p_Qk-ZMBcuw9xT4ly3_B" + +if "search_types" not in st.session_state: + st.session_state.search_types = 'Keyword Search,Vector Search,Multimodal Search,NeuralSparse Search', + +if "KendraResourcePlanID" not in st.session_state: + st.session_state.KendraResourcePlanID= "" + +if "SAGEMAKER_CrossEncoder_MODEL_ID" not in st.session_state: + st.session_state.SAGEMAKER_CrossEncoder_MODEL_ID = "eUoo-ZMBTp0efWqBQ-5g" + + +if "SAGEMAKER_SPARSE_MODEL_ID" not in st.session_state: + st.session_state.SAGEMAKER_SPARSE_MODEL_ID = "fkol-ZMBTp0efWqBcO2P" + +if "BEDROCK_TEXT_MODEL_ID" not in st.session_state: + st.session_state.BEDROCK_TEXT_MODEL_ID = "usQk-ZMBkiQuoz1QFmXN" +#bytes_for_rekog = "" +bedrock_ = boto3.client('bedrock-runtime', + aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') +search_all_type = True +if(search_all_type==True): + search_types = ['Keyword Search', + 'Vector Search', + 'Multimodal Search', + 'NeuralSparse Search', + ] +from streamlit.components.v1 import html +# with st.container(): +# html(""" +# +# """, width=0, height=0) + + + + + +def generate_images(tab,inp_): + #write_top_bar() + seed = random.randint(1, 10) + request = json.dumps( + { + "taskType": "TEXT_IMAGE", + "textToImageParams": {"text": st.session_state.image_prompt}, + "imageGenerationConfig": { + "numberOfImages": 3, + "quality": "standard", + "cfgScale": 8.0, + "height": 512, + "width": 512, + "seed": seed, + }, + } + ) + + if(inp_!=st.session_state.image_prompt): + print("call bedrocck") + response = bedrock_.invoke_model( + modelId="amazon.titan-image-generator-v1", body=request + ) + + response_body = json.loads(response["body"].read()) + st.session_state.img_gen = response_body["images"] + gen_images_dir = os.path.join(parent_dirname, "gen_images") + if os.path.exists(gen_images_dir): + shutil.rmtree(gen_images_dir) + os.mkdir(gen_images_dir) + width_ = 200 + height_ = 200 + index_ = 0 + #if(inp_!=st.session_state.image_prompt): + + if(len(st.session_state.img_gen)==0 and st.session_state.clear_ == True): + #write_top_bar() + placeholder1 = st.empty() + with tab: + with placeholder1.container(): + st.empty() + + images_dis = [] + for image_ in st.session_state.img_gen: + st.session_state.radio_disabled = False + if(index_==0): + # with tab: + # rad1, rad2,rad3 = st.columns([98,1,1]) + # if(st.session_state.input_rad_1 is None): + # rand_ = "" 
+ # else: + # rand_ = st.session_state.input_rad_1 + # if(inp_!=st.session_state.image_prompt+rand_): + # with rad1: + # sel_rad_1 = st.radio("Choose one image", ["1","2","3"],index=None, horizontal = True,key = 'input_rad_1') + + with tab: + #sel_image = st.radio("", ["1","2","3"],index=None, horizontal = True) + if(st.session_state.img_container!=""): + st.session_state.img_container.empty() + place_ = st.empty() + img1, img2,img3 = place_.columns([30,30,30]) + st.session_state.img_container = place_ + img_arr = [img1, img2,img3] + + base64_image_data = image_ + + #st.session_state.gen_image_str = base64_image_data + + print("perform multimodal search") + + Image.MAX_IMAGE_PIXELS = 100000000 + filename = st.session_state.image_prompt+"_gen_"+str(index_) + photo = parent_dirname+"/gen_images/"+filename+'.jpg' # I assume you have a way of picking unique filenames + imgdata = base64.b64decode(base64_image_data) + with open(photo, 'wb') as f: + f.write(imgdata) + + + + with Image.open(photo) as image: + file_type = 'jpg' + path = image.filename.rsplit(".", 1)[0] + image.thumbnail((width_, height_)) + image.save(parent_dirname+"/gen_images/"+filename+"-resized_display."+file_type) + + with img_arr[index_]: + placeholder_ = st.empty() + placeholder_.image(parent_dirname+"/gen_images/"+filename+"-resized_display."+file_type) + + index_ = index_ + 1 + + + +def handle_input(): + if("text" in st.session_state.inputs_): + if(st.session_state.inputs_["text"] != st.session_state.input_text): + st.session_state.input_ndcg=0.0 + st.session_state.bytes_for_rekog = "" + print("***") + + if(st.session_state.img_doc is not None or (st.session_state.input_rad_1 is not None and st.session_state.input_rad_1!="") ):#and st.session_state.input_searchType == 'Multi-modal Search'): + print("perform multimodal search") + st.session_state.input_imageUpload = 'yes' + if(st.session_state.input_rad_1 is not None and st.session_state.input_rad_1!=""): + + num_str = str(int(st.session_state.input_rad_1.strip())-1) + with open(parent_dirname+"/gen_images/"+st.session_state.image_prompt+"_gen_"+num_str+"-resized_display.jpg", "rb") as image_file: + input_image = base64.b64encode(image_file.read()).decode("utf8") + st.session_state.input_image = input_image + + if(st.session_state.input_imageUpload == 'yes' and 'Keyword Search' in st.session_state.input_searchType): + st.session_state.bytes_for_rekog = Path(parent_dirname+"/gen_images/"+st.session_state.image_prompt+"_gen_"+num_str+".jpg").read_bytes() + else: + Image.MAX_IMAGE_PIXELS = 100000000 + width = 2048 + height = 2048 + uploaded_images = os.path.join(parent_dirname, "uploaded_images") + + if not os.path.exists(uploaded_images): + os.mkdir(uploaded_images) + + with open(os.path.join(parent_dirname+"/uploaded_images",st.session_state.img_doc.name),"wb") as f: + f.write(st.session_state.img_doc.getbuffer()) + photo = parent_dirname+"/uploaded_images/"+st.session_state.img_doc.name + with Image.open(photo) as image: + image.verify() + + with Image.open(photo) as image: + width_ = 200 + height_ = 200 + if image.format.upper() in ["JPEG", "PNG","JPG"]: + path = image.filename.rsplit(".", 1)[0] + org_file_type = st.session_state.img_doc.name.split(".")[1] + image.thumbnail((width, height)) + if(org_file_type.upper()=="PNG"): + file_type = "jpg" + image.convert('RGB').save(f"{path}-resized.{file_type}") + else: + file_type = org_file_type + image.save(f"{path}-resized.{file_type}") + + image.thumbnail((width_, height_)) + 
image.save(f"{path}-resized_display.{org_file_type}") + + + with open(photo.split(".")[0]+"-resized."+file_type, "rb") as image_file: + input_image = base64.b64encode(image_file.read()).decode("utf8") + st.session_state.input_image = input_image + + if(st.session_state.input_imageUpload == 'yes' and 'Keyword Search' in st.session_state.input_searchType): + st.session_state.bytes_for_rekog = Path(parent_dirname+"/uploaded_images/"+st.session_state.img_doc.name).read_bytes() + + + + + else: + print("no image uploaded") + st.session_state.input_imageUpload = 'no' + st.session_state.input_image = '' + + + inputs = {} + # if(st.session_state.input_imageUpload == 'yes'): + # st.session_state.input_searchType = 'Multi-modal Search' + # if(st.session_state.input_sparse == 'enabled' or st.session_state.input_is_rewrite_query == 'enabled'): + # st.session_state.input_searchType = 'Keyword Search' + if(st.session_state.input_imageUpload == 'yes' and 'Keyword Search' in st.session_state.input_searchType): + old_rekog_label = st.session_state.input_rekog_label + st.session_state.input_rekog_label = amazon_rekognition.extract_image_metadata(st.session_state.bytes_for_rekog) + if(st.session_state.input_text == ""): + st.session_state.input_text = st.session_state.input_rekog_label + + # if(st.session_state.input_imageUpload == 'yes'): + # if(st.session_state.input_searchType!='Multi-modal Search'): + # if(st.session_state.input_searchType=='Keyword Search'): + # if(st.session_state.input_rekognition != 'enabled'): + # st.error('For Keyword Search using images, enable "Enrich metadata for Images" in the left panel',icon = "🚨") + # #st.session_state.input_rekognition = 'enabled' + # st.switch_page('pages/1_Semantic_Search.py') + # #st.stop() + + # else: + # st.error('Please set the search type as "Keyword Search (enabling Enrich metadata for Images) or Multi-modal Search"',icon = "🚨") + # #st.session_state.input_searchType='Multi-modal Search' + # st.switch_page('pages/1_Semantic_Search.py') + # #st.stop() + + + weightage = {} + st.session_state.weights_ = [] + total_weight = 0.0 + counter = 0 + num_search = len(st.session_state.input_searchType) + any_weight_zero = False + for type in st.session_state.input_searchType: + key_weight = "input_"+type.split(" ")[0]+"-weight" + total_weight = total_weight + st.session_state[key_weight] + if(st.session_state[key_weight]==0): + any_weight_zero = True + print(total_weight) + for key in st.session_state: + + if(key.startswith('input_')): + original_key = key.removeprefix('input_') + if('weight' not in key): + inputs[original_key] = st.session_state[key] + else: + if(original_key.split("-")[0] + " Search" in st.session_state.input_searchType): + counter = counter +1 + if(total_weight!=100 or any_weight_zero == True): + extra_weight = 100%num_search + if(counter == num_search): + cal_weight = math.trunc(100/num_search)+extra_weight + else: + cal_weight = math.trunc(100/num_search) + + st.session_state[key] = cal_weight + weightage[original_key] = cal_weight + st.session_state.weights_.append(cal_weight) + else: + weightage[original_key] = st.session_state[key] + st.session_state.weights_.append(st.session_state[key]) + else: + weightage[original_key] = 0.0 + st.session_state[key] = 0.0 + + + + + + + + + + + + + + inputs['weightage']=weightage + st.session_state.input_weightage = weightage + + print("====================") + print(st.session_state.weights_) + print(st.session_state.input_weightage ) + print("====================") + 
#print("***************************") + #print(sum(weights_)) + # if(sum(st.session_state.weights_)!=100): + # st.warning('The total weight of selected search type(s) should be equal to 100',icon = "🚨") + # refresh = st.button("Re-Enter") + # if(refresh): + # st.switch_page('pages/1_Semantic_Search.py') + # st.stop() + + + # #st.session_state.input_rekognition = 'enabled' + # st.rerun() + + + + st.session_state.inputs_ = inputs + + #st.write(inputs) + question_with_id = { + 'question': inputs["text"], + 'id': len(st.session_state.questions) + } + st.session_state.questions = [] + st.session_state.questions.append(question_with_id) + + st.session_state.answers = [] + + if(st.session_state.input_is_sql_query == 'enabled'): + os_index_df_sql.sql_process(st.session_state.input_text) + print(st.session_state.input_sql_query) + else: + st.session_state.input_sql_query = "" + + + if(st.session_state.input_is_rewrite_query == 'enabled' or (st.session_state.input_imageUpload == 'yes' and 'Keyword Search' in st.session_state.input_searchType)): + query_rewrite.get_new_query_res(st.session_state.input_text) + print("-------------------") + print(st.session_state.input_rewritten_query) + print("-------------------") + else: + st.session_state.input_rewritten_query = "" + + # elif(st.session_state.input_rekog_label!="" and st.session_state.input_rekognition == 'enabled'): + # ans__ = amazon_rekognition.call(st.session_state.input_text,st.session_state.input_rekog_label) + # else: + ans__ = all_search_execute.handler(inputs, st.session_state['session_id']) + + st.session_state.answers.append({ + 'answer': ans__,#all_search_api.call(json.dumps(inputs), st.session_state['session_id']), + 'search_type':inputs['searchType'], + 'id': len(st.session_state.questions) + }) + + st.session_state.answers_none_rank = st.session_state.answers + if(st.session_state.input_reranker == "None"): + st.session_state.answers = st.session_state.answers_none_rank + else: + if(st.session_state.input_reranker == 'Kendra Rescore'): + st.session_state.answers = re_ranker.re_rank("search",st.session_state.input_reranker,st.session_state.input_searchType,st.session_state.questions, st.session_state.answers) + if(st.session_state.input_evaluate == "enabled"): + llm_eval.eval(st.session_state.questions, st.session_state.answers) + #st.session_state.input_text="" + #st.session_state.input_searchType=st.session_state.input_searchType + +def write_top_bar(): + # st.markdown(""" + # + # """,unsafe_allow_html=True) + #print("top bar") + # st.title(':mag: AI powered OpenSearch') + # st.write("") + # st.write("") + col1, col2,col3,col4 = st.columns([2.5,35,8,7]) + with col1: + st.image(TEXT_ICON, use_container_width='always') + with col2: + #st.markdown("") + input = st.text_input( "Ask here",label_visibility = "collapsed",key="input_text",placeholder = "Type your query") + with col3: + play = st.button("Search",on_click=handle_input,key = "play") + + with col4: + clear = st.button("Clear") + + col5, col6 = st.columns([4.5,95]) + + with col5: + st.image(IMAGE_ICON, use_container_width='always') + with col6: + with st.expander(':green[Search by using an image]'): + tab2, tab1 = st.tabs(["Upload Image","Generate Image by AI"]) + + with tab1: + c1,c2 = st.columns([80,20]) + with c1: + gen_images=st.text_area("Text2Image:",placeholder = "Enter the text prompt to generate images",height = 68, key = "image_prompt") + with c2: + st.markdown("
",unsafe_allow_html=True) + st.button("Generate",disabled=False,key = "generate",on_click = generate_images, args=(tab1,"default_img")) + + # image_select = st.select_slider( + # "Select a image", + # options=["Image 1","Image 2","Image 3"], value = None, disabled = st.session_state.radio_disabled,key = "image_select") + image_select = st.radio("Choose one image", ["Image 1","Image 2","Image 3"],index=None, horizontal = True,key = 'image_select',disabled = st.session_state.radio_disabled) + st.markdown(""" + + """,unsafe_allow_html=True) + if(st.session_state.image_select is not None and st.session_state.image_select !="" and len(st.session_state.img_gen)!=0): + print("image_select") + print("------------") + print(st.session_state.image_select) + st.session_state.input_rad_1 = st.session_state.image_select.split(" ")[1] + else: + st.session_state.input_rad_1 = "" + # rad1, rad2,rad3 = st.columns([33,33,33]) + # with rad1: + # btn1 = st.button("choose image 1", disabled = st.session_state.radio_disabled) + # with rad2: + # btn2 = st.button("choose image 2", disabled = st.session_state.radio_disabled) + # with rad3: + # btn3 = st.button("choose image 3", disabled = st.session_state.radio_disabled) + # if(btn1): + # st.session_state.input_rad_1 = "1" + # if(btn2): + # st.session_state.input_rad_1 = "2" + # if(btn3): + # st.session_state.input_rad_1 = "3" + + + generate_images(tab1,gen_images) + + + with tab2: + st.session_state.img_doc = st.file_uploader( + "Upload image", accept_multiple_files=False,type = ['png', 'jpg']) + + + + + + return clear,tab1 + +clear,tab_ = write_top_bar() + +if clear: + + + print("clear1") + st.session_state.questions = [] + st.session_state.answers = [] + + st.session_state.clear_ = True + st.session_state.image_prompt2 = "" + st.session_state.input_rekog_label = "" + + st.session_state.radio_disabled = True + + if(len(st.session_state.img_gen)!=0): + st.session_state.img_container.empty() + st.session_state.img_gen = [] + st.session_state.input_rad_1 = "" + + + # placeholder1 = st.empty() + # with placeholder1.container(): + # generate_images(tab_,st.session_state.image_prompt) + + + #st.session_state.input_text="" + # st.session_state.input_searchType="Conversational Search (RAG)" + # st.session_state.input_temperature = "0.001" + # st.session_state.input_topK = 200 + # st.session_state.input_topP = 0.95 + # st.session_state.input_maxTokens = 1024 + +col1, col3, col4 = st.columns([70,18,12]) + +with col1: + + if(st.session_state.max_selections == "" or st.session_state.max_selections == "1"): + st.session_state.max_selections = 1 + if(st.session_state.max_selections == "None"): + st.session_state.max_selections = None + search_type = st.multiselect('Select the Search type(s)', + search_types,['Keyword Search'], + max_selections = st.session_state.max_selections, + + key = 'input_searchType', + help = "Select the type of Search, adding more than one search type will activate hybrid search"#\n1. Conversational Search (Recommended) - This will include both the OpenSearch and LLM in the retrieval pipeline \n (note: This will put opensearch response as context to LLM to answer) \n2. OpenSearch vector search - This will put only OpenSearch's vector search in the pipeline, \n(Warning: this will lead to unformatted results )\n3. LLM Text Generation - This will include only LLM in the pipeline, \n(Warning: This will give hallucinated and out of context answers)" + ) + +with col3: + st.number_input("No. 
of docs", min_value=1, max_value=50, value=5, step=5, key='input_K', help=None) +with col4: + st.markdown("
Evaluate
",unsafe_allow_html=True) + evaluate = st.toggle(' ', key = 'evaluate', disabled = False) #help = "Checking this box will use LLM to evaluate results as relevant and irrelevant. \n\n This option increases the latency") + if(evaluate): + st.session_state.input_evaluate = "enabled" + #llm_eval.eval(st.session_state.questions, st.session_state.answers) + else: + st.session_state.input_evaluate = "disabled" + + +if(search_all_type == True or 1==1): + with st.sidebar: + st.page_link("app.py", label=":orange[Home]", icon="🏠") + #st.image('/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp/images/service_logo.png', width = 300) + #st.warning('Note: After changing any of the below settings, click "SEARCH" button or 🔄 to apply the changes', icon="⚠️") + #st.header(' :gear: :orange[Fine-tune Search]') + #st.write("Note: After changing any of the below settings, click 'SEARCH' button or '🔄' to apply the changes") + #st.subheader(':blue[Keyword Search]') + + ########################## enable for query_rewrite ######################## + rewrite_query = st.checkbox('Auto-apply filters', key = 'query_rewrite', disabled = False, help = "Checking this box will use LLM to rewrite your query. \n\n Here your natural language query is transformed into OpenSearch query with added filters and attributes") + st.multiselect('Fields for "MUST" filter', + ('Price','Gender', 'Color', 'Category', 'Style'),['Category'], + + key = 'input_must', + ) + ########################## enable for query_rewrite ######################## + ####### Filters ######### + + st.subheader(':blue[Filters]') + def clear_filter(): + st.session_state.input_manual_filter="False" + st.session_state.input_category=None + st.session_state.input_gender=None + st.session_state.input_price=(0,0) + handle_input() + filter_place_holder = st.container() + with filter_place_holder: + st.selectbox("Select one Category", ("accessories", "books","floral","furniture","hot_dispensed","jewelry","tools","apparel","cold_dispensed","food_service","groceries","housewares","outdoors","salty_snacks","videos","beauty","electronics","footwear","homedecor","instruments","seasonal"),index = None,key = "input_category") + st.selectbox("Select one Gender", ("male","female"),index = None,key = "input_gender") + st.slider("Select a range of price", 0, 2000, (0, 0),50, key = "input_price") + + if(st.session_state.input_category!=None or st.session_state.input_gender!=None or st.session_state.input_price!=(0,0)): + st.session_state.input_manual_filter="True" + else: + st.session_state.input_manual_filter="False" + + + clear_filter = st.button("Clear Filters",on_click=clear_filter) + + +# filter_place_holder = st.container() +# with filter_place_holder: +# st.selectbox("Select one Category", ("accessories", "books","floral","furniture","hot_dispensed","jewelry","tools","apparel","cold_dispensed","food_service","groceries","housewares","outdoors","salty_snacks","videos","beauty","electronics","footwear","homedecor","instruments","seasonal"),index = None,key = "input_category") +# st.selectbox("Select one Gender", ("male","female"),index = None,key = "input_gender") +# st.slider("Select a range of price", 0, 2000, (0, 0),50, key = "input_price") + +# st.session_state.input_category=None +# st.session_state.input_gender=None +# st.session_state.input_price=(0,0) + + print("--------------------filters---------------") + print(st.session_state.input_gender) + print(st.session_state.input_manual_filter) + print("--------------------filters---------------") + + + + 
####### Filters ######### + + if('NeuralSparse Search' in st.session_state.search_types): + st.subheader(':blue[Neural Sparse Search]') + sparse_filter = st.slider('Keep only sparse tokens with weight >=', 0.0, 1.0, 0.5,0.1,key = 'input_sparse_filter', help = 'Use this slider to set the minimum weight that the sparse vector token weights should meet, rest are filtered out') + + + #sql_query = st.checkbox('Re-write as SQL query', key = 'sql_rewrite', disabled = True, help = "In Progress") + st.session_state.input_is_rewrite_query = 'disabled' + st.session_state.input_is_sql_query = 'disabled' + + ########################## enable for query_rewrite ######################## + if rewrite_query: + #st.write(st.session_state.inputs_) + st.session_state.input_is_rewrite_query = 'enabled' + # if sql_query: + # #st.write(st.session_state.inputs_) + # st.session_state.input_is_sql_query = 'enabled' + ########################## enable for sql conversion ######################## + + + #st.markdown('---') + #st.header('Fine-tune keyword Search', divider='rainbow') + #st.subheader('Note: The below selection applies only when the Search type is set to Keyword Search') + + + # st.markdown("Enrich metadata for :",unsafe_allow_html=True) + + + + # c3,c4 = st.columns([10,90]) + # with c4: + # rekognition = st.checkbox('Images', key = 'rekognition', help = "Checking this box will use AI to extract metadata for images that are present in query and documents") + # if rekognition: + # #st.write(st.session_state.inputs_) + # st.session_state.input_rekognition = 'enabled' + # else: + # st.session_state.input_rekognition = "disabled" + + #st.markdown('---') + #st.header('Fine-tune Hybrid Search', divider='rainbow') + #st.subheader('Note: The below parameters apply only when the Search type is set to Hybrid Search') + + + + + + + + #st.write("---") + #if(st.session_state.max_selections == "None"): + st.subheader(':blue[Hybrid Search]') + # st.selectbox('Select the Hybrid Search type', + # ("OpenSearch Hybrid Query","Reciprocal Rank Fusion"),key = 'input_hybridType') + # equal_weight = st.button("Give equal weights to selected searches") + + + + + + + #st.warning('Weight of each of the selected search type should be greater than 0 and the total weight of all the selected search type(s) should be equal to 100',icon = "⚠️") + + + #st.markdown("

Set Weights

",unsafe_allow_html=True) + + with st.expander("Set query Weightage:"): + st.number_input("Keyword %", min_value=0, max_value=100, value=100, step=5, key='input_Keyword-weight', help=None) + st.number_input("Vector %", min_value=0, max_value=100, value=0, step=5, key='input_Vector-weight', help=None) + st.number_input("Multimodal %", min_value=0, max_value=100, value=0, step=5, key='input_Multimodal-weight', help=None) + st.number_input("NeuralSparse %", min_value=0, max_value=100, value=0, step=5, key='input_NeuralSparse-weight', help=None) + + # if(equal_weight): + # counter = 0 + # num_search = len(st.session_state.input_searchType) + # weight_type = ["input_Keyword-weight","input_Vector-weight","input_Multimodal-weight","input_NeuralSparse-weight"] + # for type in weight_type: + # if(type.split("-")[0].replace("input_","")+ " Search" in st.session_state.input_searchType): + # print("ssssssssssss") + # counter = counter +1 + # extra_weight = 100%num_search + # if(counter == num_search): + # cal_weight = math.trunc(100/num_search)+extra_weight + # else: + # cal_weight = math.trunc(100/num_search) + # st.session_state[weight_type] = cal_weight + # else: + # st.session_state[weight_type] = 0 + #weight = st.slider('Weight for Vector Search', 0.0, 1.0, 0.5,0.1,key = 'input_weight', help = 'Use this slider to set the weightage for keyword and vector search, higher values of the slider indicate the increased weightage for semantic search.\n\n This applies only when the search type is set to Hybrid Search') + # st.selectbox('Select the Normalisation type', + # ('min_max', + # 'l2' + # ), + #st.write("---") + # key = 'input_NormType', + # disabled = True, + # help = "Select the type of Normalisation to be applied on the two sets of scores" + # ) + + # st.selectbox('Select the Score Combination type', + # ('arithmetic_mean','geometric_mean','harmonic_mean' + # ), + + # key = 'input_CombineType', + # disabled = True, + # help = "Select the Combination strategy to be used while combining the two scores of the two search queries for every document" + # ) + + #st.markdown('---') + + #st.header('Select the ML Model for text embedding', divider='rainbow') + #st.subheader('Note: The below selection applies only when the Search type is set to Vector or Hybrid Search') + if(st.session_state.re_ranker == "true"): + st.subheader(':blue[Re-ranking]') + reranker = st.selectbox('Choose a Re-Ranker', + ('None','Cross Encoder'#'Kendra Rescore' + + ), + + key = 'input_reranker', + help = 'Select the Re-Ranker type, select "None" to apply no re-ranking of the results', + #on_change = re_ranker.re_rank, + args=(st.session_state.questions, st.session_state.answers) + + ) + # st.write("---") + # st.subheader('Text Embeddings Model') + # model_type = st.selectbox('Select the Text Embeddings Model', + # ('Titan-Embed-Text-v1','GPT-J-6B' + + # ), + + # key = 'input_modelType', + # help = "Select the Text embedding model, this applies only for the vector and hybrid search" + # ) + + #st.markdown('---') + + + + + + + +#st.markdown('---') + + +def write_user_message(md,ans): + #print(ans) + ans = ans["answer"][0] + col1, col2, col3 = st.columns([3,40,20]) + + with col1: + st.image(USER_ICON, use_container_width='always') + with col2: + #st.warning(md['question']) + st.markdown("
Input Text:
"+md['question']+"
", unsafe_allow_html = True) + if('query_sparse' in ans): + with st.expander("Expanded Query:"): + query_sparse = dict(sorted(ans['query_sparse'].items(), key=lambda item: item[1],reverse=True)) + filtered_query_sparse = dict() + for key in query_sparse: + filtered_query_sparse[key] = round(query_sparse[key], 2) + st.write(filtered_query_sparse) + if(st.session_state.input_is_rewrite_query == "enabled" and st.session_state.input_rewritten_query !=""): + with st.expander("Re-written Query:"): + st.json(st.session_state.input_rewritten_query,expanded = True) + + + with col3: + st.markdown("
Input Image:
", unsafe_allow_html = True) + + if(st.session_state.input_imageUpload == 'yes'): + + if(st.session_state.input_rad_1 is not None and st.session_state.input_rad_1!=""): + num_str = str(int(st.session_state.input_rad_1.strip())-1) + img_file = parent_dirname+"/gen_images/"+st.session_state.image_prompt+"_gen_"+num_str+"-resized_display.jpg" + else: + img_file = parent_dirname+"/uploaded_images/"+st.session_state.img_doc.name.split(".")[0]+"-resized_display."+st.session_state.img_doc.name.split(".")[1] + + st.image(img_file) + if(st.session_state.input_rekog_label !=""): + with st.expander("Enriched Query Metadata:"): + st.markdown('

'+json.dumps(st.session_state.input_rekog_directoutput)+'

',unsafe_allow_html=True) + else: + st.markdown("

None
", unsafe_allow_html = True) + + + + + st.markdown('---') + + +# def stem_(sentence): +# words = word_tokenize(sentence) + +# words_stem = [] + +# for w in words: +# words_stem.append( ps.stem(w)) +# return words_stem + +def render_answer(answer,index): + column1, column2 = st.columns([6,90]) + with column1: + st.image(AI_ICON, use_container_width='always') + with column2: + st.markdown("
Results
", unsafe_allow_html = True) + if(st.session_state.input_evaluate == "enabled" and st.session_state.input_ndcg > 0): + span_color = "white" + if("↑" in st.session_state.ndcg_increase): + span_color = "green" + if("↓" in st.session_state.ndcg_increase): + span_color = "red" + st.markdown("Relevance:" +str('%.3f'%(st.session_state.input_ndcg)) + ""+st.session_state.ndcg_increase.split("~")[0] +" "+st.session_state.ndcg_increase.split("~")[1]+"", unsafe_allow_html = True) + + + #st.markdown(""+st.session_state.ndcg_increase.split("~")[0] +""+st.session_state.ndcg_increase.split("~")[1]+"",unsafe_allow_html = True) + + + + placeholder_no_results = st.empty() + + col_1, col_2,col_3 = st.columns([70,10,20]) + i = 0 + filter_out = 0 + for ans in answer: + + + + if('b5/b5319e00' in ans['image_url'] ): + filter_out+=1 + continue + + + # imgdata = base64.b64decode(ans['image_binary']) + format_ = ans['image_url'].split(".")[-1] + + #urllib.request.urlretrieve(ans['image_url'], "/home/ubuntu/res_images/"+str(i)+"_."+format_) + + + Image.MAX_IMAGE_PIXELS = 100000000 + + width = 500 + height = 500 + + + + with col_1: + inner_col_1,inner_col_2 = st.columns([8,92]) + with inner_col_2: + st.image(ans['image_url'].replace("/home/ec2-user/SageMaker/","/home/user/")) + + if("highlight" in ans and 'Keyword Search' in st.session_state.input_searchType): + test_strs = ans["highlight"] + tag = "em" + res__ = [] + for test_str in test_strs: + start_idx = test_str.find("<" + tag + ">") + + while start_idx != -1: + end_idx = test_str.find("", start_idx) + if end_idx == -1: + break + res__.append(test_str[start_idx+len(tag)+2:end_idx]) + start_idx = test_str.find("<" + tag + ">", end_idx) + + + desc__ = ans['desc'].split(" ") + + final_desc = "

" + + ###### stemming and highlighting + + # ans_text = ans['desc'] + # query_text = st.session_state.input_text + + # ans_text_stemmed = set(stem_(ans_text)) + # query_text_stemmed = set(stem_(query_text)) + + # common = ans_text_stemmed.intersection( query_text_stemmed) + # #unique = set(document_1_words).symmetric_difference( ) + + # desc__stemmed = stem_(desc__) + + # for word_ in desc__stemmed: + # if(word_ in common): + + + for word in desc__: + if(re.sub('[^A-Za-z0-9]+', '', word) in res__): + final_desc += ""+word+" " + else: + final_desc += word + " " + + final_desc += "

" + + st.markdown(final_desc,unsafe_allow_html = True) + else: + st.write(ans['desc']) + if("sparse" in ans): + with st.expander("Expanded document:"): + sparse_ = dict(sorted(ans['sparse'].items(), key=lambda item: item[1],reverse=True)) + filtered_sparse = dict() + for key in sparse_: + if(sparse_[key]>=1.0): + filtered_sparse[key] = round(sparse_[key], 2) + st.write(filtered_sparse) + with st.expander("Document Metadata:",expanded = False): + # if("rekog" in ans): + # div_size = [50,50] + # else: + # div_size = [99,1] + # div1,div2 = st.columns(div_size) + # with div1: + + st.write(":green[default:]") + st.json({"category:":ans['category'],"price":str(ans['price']),"gender_affinity":ans['gender_affinity'],"style":ans['style']},expanded = True) + #with div2: + if("rekog" in ans): + st.write(":green[enriched:]") + st.json(ans['rekog'],expanded = True) + with inner_col_1: + + if(st.session_state.input_evaluate == "enabled"): + with st.container(border = False): + if("relevant" in ans.keys()): + if(ans['relevant']==True): + st.write(":white_check_mark:") + else: + st.write(":x:") + + i = i+1 + # with col_2: + # if(st.session_state.input_evaluate == "enabled"): + # st.markdown("
DCG: " +str('%.3f'%(st.session_state.input_ndcg)) + "
", unsafe_allow_html = True) + # with col_2_b: + # span_color = "white" + # if("↑" in st.session_state.ndcg_increase): + # span_color = "green" + # if("↓" in st.session_state.ndcg_increase): + # span_color = "red" + # st.markdown(""+st.session_state.ndcg_increase.split("~")[0] +""+st.session_state.ndcg_increase.split("~")[1]+"",unsafe_allow_html = True) + + + with col_3: + if(index == len(st.session_state.questions)): + + rdn_key = ''.join([random.choice(string.ascii_letters) + for _ in range(10)]) + currentValue = "".join(st.session_state.input_searchType)+st.session_state.input_imageUpload+json.dumps(st.session_state.input_weightage)+st.session_state.input_NormType+st.session_state.input_CombineType+str(st.session_state.input_K)+st.session_state.input_sparse+st.session_state.input_reranker+st.session_state.input_is_rewrite_query+st.session_state.input_evaluate+st.session_state.input_image+st.session_state.input_rad_1+st.session_state.input_reranker+st.session_state.input_hybridType+st.session_state.input_manual_filter + oldValue = "".join(st.session_state.inputs_["searchType"])+st.session_state.inputs_["imageUpload"]+str(st.session_state.inputs_["weightage"])+st.session_state.inputs_["NormType"]+st.session_state.inputs_["CombineType"]+str(st.session_state.inputs_["K"])+st.session_state.inputs_["sparse"]+st.session_state.inputs_["reranker"]+st.session_state.inputs_["is_rewrite_query"]+st.session_state.inputs_["evaluate"]+st.session_state.inputs_["image"]+st.session_state.inputs_["rad_1"]+st.session_state.inputs_["reranker"]+st.session_state.inputs_["hybridType"]+st.session_state.inputs_["manual_filter"] + + def on_button_click(): + if(currentValue!=oldValue): + st.session_state.input_text = st.session_state.questions[-1]["question"] + st.session_state.answers.pop() + st.session_state.questions.pop() + + handle_input() + #re_ranker.re_rank(st.session_state.questions, st.session_state.answers) + with placeholder.container(): + render_all() + + + + if("currentValue" in st.session_state): + del st.session_state["currentValue"] + + try: + del regenerate + except: + pass + + print("------------------------") + #print(st.session_state) + + placeholder__ = st.empty() + + placeholder__.button("🔄",key=rdn_key,on_click=on_button_click, help = "This will regenerate the responses with new settings that you entered, Note: To see difference in responses, you should change any of the applicable settings")#,type="primary",use_container_width=True) + + if(filter_out > 0): + placeholder_no_results.text(str(filter_out)+" result(s) removed due to missing or in-appropriate content") + + + +#Each answer will have context of the question asked in order to associate the provided feedback with the respective question +def write_chat_message(md, q,index): + if('body' in md['answer']): + res = json.loads(md['answer']['body']) + else: + res = md['answer'] + st.session_state['session_id'] = "1234" + chat = st.container() + with chat: + render_answer(res,index) + +def render_all(): + index = 0 + for (q, a) in zip(st.session_state.questions, st.session_state.answers): + index = index +1 + #print("answers----") + #print(a) + ans_ = st.session_state.answers[0] + write_user_message(q,ans_) + write_chat_message(a, q,index) + +placeholder = st.empty() +with placeholder.container(): + render_all() + + #generate_images("",st.session_state.image_prompt) + +st.markdown("") diff --git a/pdfs/UK_housing_stats.pdf b/pdfs/UK_housing_stats.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..7cc35e26ef09e5f5574ba38ee06bf21af7982bb5 Binary files /dev/null and b/pdfs/UK_housing_stats.pdf differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..77a8768fdfd0c01ac29aef80da40c47687435c41 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +streamlit==1.38.0 +boto3==1.35.4 +requests_aws4auth==1.3.1 +opensearch-py==2.7.1 +sentence-transformers==3.1.0 +nltk +ruamel_yaml +tabulate +pdf2image +python-dateutil +poppler-utils +unstructured[all-docs] +PyPDF2 +langchain==0.2.16 +langchain-core==0.2.39 +langchain-community==0.2.16 +langchain-experimental==0.0.65 +lark==1.2.2 \ No newline at end of file diff --git a/semantic_search/all_search_execute.py b/semantic_search/all_search_execute.py new file mode 100644 index 0000000000000000000000000000000000000000..d25433e3316e9bc966b356a868efcc728ff447ea --- /dev/null +++ b/semantic_search/all_search_execute.py @@ -0,0 +1,558 @@ +''' +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +SPDX-License-Identifier: MIT-0 +''' + +from collections import namedtuple +from datetime import datetime, timedelta +from dateutil import tz, parser +import itertools +import json +import os +import time +import uuid +import requests +from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth +from requests_aws4auth import AWS4Auth +from requests.auth import HTTPBasicAuth +from datetime import datetime +import boto3 +import streamlit as st + + + + +current_date_time = (datetime.now()).isoformat() +today_ = datetime.today().strftime('%Y-%m-%d') + + +def handler(input_,session_id): + DOMAIN_ENDPOINT = st.session_state.OpenSearchDomainEndpoint #"search-opensearchservi-rimlzstyyeih-3zru5p2nxizobaym45e5inuayq.us-west-2.es.amazonaws.com" + REGION = st.session_state.REGION + #SAGEMAKER_MODEL_ID = st.session_state.SAGEMAKER_MODEL_ID + BEDROCK_TEXT_MODEL_ID = st.session_state.BEDROCK_TEXT_MODEL_ID + BEDROCK_MULTIMODAL_MODEL_ID = st.session_state.BEDROCK_MULTIMODAL_MODEL_ID + SAGEMAKER_SPARSE_MODEL_ID = st.session_state.SAGEMAKER_SPARSE_MODEL_ID + SAGEMAKER_CrossEncoder_MODEL_ID = st.session_state.SAGEMAKER_CrossEncoder_MODEL_ID + print("BEDROCK_TEXT_MODEL_ID") + print(BEDROCK_TEXT_MODEL_ID) + + ####### Hybrid Search weights logic for throwing warning to users for inappropriate weights ####### + + # def my_filtering_function(pair): + # key, value = pair + # if key.split("-")[0] + " Search" in st.session_state["inputs_"]["searchType"]: + # return True # keep pair in the filtered dictionary + # else: + # return False # filter pair out of the dictionary + + + # filtered_search = dict(filter(my_filtering_function, st.session_state.input_weightage.items())) + + # search_types_used = ", ".join(st.session_state["inputs_"]["searchType"]) + + # if((sum(st.session_state.weights_)!=100 or len(st.session_state["inputs_"]["searchType"])!=len(list(filter(lambda a: a >0, st.session_state.weights_)))) and len(st.session_state["inputs_"]["searchType"])!=1): + # st.warning('User Input Error for **WEIGHTS** :-\n\nOne or both of the below conditions was not satisfied, \n1. The total weight of all the selected search type(s): "'+search_types_used+'" should be equal to 100 \n 2. 
The weight of each of the search types, "'+search_types_used+'" should be greater than 0 \n\n Entered input: '+json.dumps(filtered_search)+'\n\n Please re-enter your weights to satisfy the above conditions and try again',icon = "🚨") + # refresh = st.button("Re-Enter") + # if(refresh): + # st.switch_page('pages/1_Semantic_Search.py') + # st.stop() + + ####### Auth and connection for OpenSearch domain ####### + credentials = boto3.Session().get_credentials() + awsauth = HTTPBasicAuth('master',st.secrets['ml_search_demo_api_access']) + host = 'https://'+DOMAIN_ENDPOINT+'/' + headers = {"Content-Type": "application/json"} + + + ####### Parsing Inputs from user ####### + + print("*********") + print(input_) + search_types = input_["searchType"] + + if("NormType" not in input_.keys()): + norm_type = "min_max" + else: + norm_type = input_["NormType"] + + if("CombineType" not in input_.keys()): + combine_type = "arithmetic_mean" + else: + combine_type = input_["CombineType"] + + if("weight" not in input_.keys()): + semantic_weight = 0.5 + else: + semantic_weight = input_["weight"] + + + + query = input_["text"] + img = input_["image"] + + if("sparse" not in input_.keys()): + sparse = "disabled" + else: + sparse = input_["sparse"] + + + k_ = input_["K"] + image_upload = input_["imageUpload"] + + + + num_queries = len(search_types) + + weights = [] + + searches = ['Keyword','Vector','Multimodal','NeuralSparse'] + for i in searches: + weight = input_['weightage'][i+'-weight']/100 + if(weight>0.0): + weights.append(weight) + + + + ######## Updating hybrid Search pipeline ####### + print("Updating Search pipeline with new weights") + s_pipeline_payload = {"version": 1234} + s_pipeline_payload["phase_results_processors"] = [ + { + "normalization-processor": { + "normalization": { + "technique": norm_type + }, + "combination": { + "technique": combine_type, + "parameters": { + "weights": weights + } + } + } + } + ] + + + + opensearch_search_pipeline = (requests.get(host+'_search/pipeline/hybrid_search_pipeline', auth=awsauth,headers=headers)).text + print("opensearch_search_pipeline") + print(opensearch_search_pipeline) + if(opensearch_search_pipeline!='{}'): + path = "_search/pipeline/hybrid_search_pipeline" + url = host + path + r = requests.put(url, auth=awsauth, json=s_pipeline_payload, headers=headers) + print("Hybrid Search Pipeline updated: "+str(r.status_code)) + ######## Combining hybrid+rerank pipeline ####### + opensearch_rerank_pipeline = (requests.get(host+'_search/pipeline/rerank_pipeline', auth=awsauth,headers=headers)).text + print("opensearch_rerank_pipeline") + print(opensearch_rerank_pipeline) + + + + ######## start of Applying LLM filters ####### + if(st.session_state.input_rewritten_query!=""): + filter_ = {"filter": { + "bool": { + "must": []}}} + filter_['filter']['bool']['must'] = st.session_state.input_rewritten_query['query']['bool']['must'] + ######## end of Applying LLM filters ####### + + ######### Create the queries for hybrid search ######### + + + path = "demostore-search-index/_search" + + url = host + path + + hybrid_payload = { + "_source": { + "exclude": [ + "product_description_vector","product_multimodal_vector","product_image" + ] + }, + "query": { + "hybrid": { + "queries": [ + + #1. keyword query + #2. vector search query + #3. multimodal query + #4. 
Sparse query + + ] + } + },"size":k_, + "highlight": { + "fields": { + "product_description": {} + } + }} + + + + if('Keyword Search' in search_types): + + keyword_payload = { + "match": { + "product_description": { + "query": query + } + } + } + if(st.session_state.input_rewritten_query !=""): + keyword_payload = st.session_state.input_rewritten_query['query'] + + if(st.session_state.input_manual_filter == "True"): + keyword_payload['bool']={'filter':[]} + if(st.session_state.input_category!=None): + keyword_payload['bool']['filter'].append({"term": {"category": st.session_state.input_category}}) + if(st.session_state.input_gender!=None): + keyword_payload['bool']['filter'].append({"term": {"gender_affinity": st.session_state.input_gender}}) + if(st.session_state.input_price!=(0,0)): + keyword_payload['bool']['filter'].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}}) + + keyword_payload['bool']['must'] = [{ + "match": { + "product_description": { + "query": query + } + } + }] + del keyword_payload['match'] +# print("keyword_payload**************") +# print(keyword_payload) + + + hybrid_payload["query"]["hybrid"]["queries"].append(keyword_payload) + + if('Vector Search' in search_types): + +# path3 = "_plugins/_ml/models/"+BEDROCK_TEXT_MODEL_ID+"/_predict" + +# url3 = host+path3 + +# payload3 = { +# "parameters": { +# "inputText": query +# } +# } + +# r3 = requests.post(url3, auth=awsauth, json=payload3, headers=headers) +# vector_ = json.loads(r3.text) +# #print(r3.text) +# query_vector = vector_['inference_results'][0]['output'][0]['data'] +# #print(query_vector) + +# vector_payload = { +# "knn": { +# "product_description_vector": { +# "vector":query_vector, +# #"query_text": query, +# #"model_id": BEDROCK_TEXT_MODEL_ID, +# "k": k_ +# } +# } +# } + + #using neural query + vector_payload = { + "neural": { + "product_description_vector": { + "query_text": query, + "model_id": BEDROCK_TEXT_MODEL_ID, + "k": k_ + } + } + } + + ###### start of efficient filter applying ##### + if(st.session_state.input_rewritten_query!=""): + vector_payload['neural']['product_description_vector']['filter'] = filter_['filter'] + + if(st.session_state.input_manual_filter == "True"): + vector_payload['neural']['product_description_vector']['filter'] = {"bool":{"must":[]}} + if(st.session_state.input_category!=None): + vector_payload['neural']['product_description_vector']['filter']["bool"]["must"].append({"term": {"category": st.session_state.input_category}}) + if(st.session_state.input_gender!=None): + vector_payload['neural']['product_description_vector']['filter']["bool"]["must"].append({"term": {"gender_affinity": st.session_state.input_gender}}) + if(st.session_state.input_price!=(0,0)): + vector_payload['neural']['product_description_vector']['filter']["bool"]["must"].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}}) + +# print("vector_payload**************") +# print(vector_payload) + + ###### end of efficient filter applying ##### + + hybrid_payload["query"]["hybrid"]["queries"].append(vector_payload) + + if('Multimodal Search' in search_types): + + multimodal_payload = { + + "neural": { + "product_multimodal_vector": { + + "model_id": BEDROCK_MULTIMODAL_MODEL_ID, + "k": k_ + } + } + } + + + if(image_upload == 'yes' and query == ""): + multimodal_payload["neural"]["product_multimodal_vector"]["query_image"] = img + if(image_upload == 'no' and query != ""): + 
multimodal_payload["neural"]["product_multimodal_vector"]["query_text"] = query + if(image_upload == 'yes' and query != ""): + + multimodal_payload["neural"]["product_multimodal_vector"]["query_image"] = img + multimodal_payload["neural"]["product_multimodal_vector"]["query_text"] = query + + ###### start of efficient filter applying ##### + if(st.session_state.input_rewritten_query!=""): + multimodal_payload['neural']['product_multimodal_vector']['filter'] = filter_['filter'] + + if(st.session_state.input_manual_filter == "True"): + print("presence of filters------------") + multimodal_payload['neural']['product_multimodal_vector']['filter'] = {"bool":{"must":[]}} + if(st.session_state.input_category!=None): + multimodal_payload['neural']['product_multimodal_vector']['filter']["bool"]["must"].append({"term": {"category": st.session_state.input_category}}) + if(st.session_state.input_gender!=None): + multimodal_payload['neural']['product_multimodal_vector']['filter']["bool"]["must"].append({"term": {"gender_affinity": st.session_state.input_gender}}) + if(st.session_state.input_price!=(0,0)): + multimodal_payload['neural']['product_multimodal_vector']['filter']["bool"]["must"].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}}) + +# print("vector_payload**************") +# print(vector_payload) + + ###### end of efficient filter applying ##### + + hybrid_payload["query"]["hybrid"]["queries"].append(multimodal_payload) + + + + + if('NeuralSparse Search' in search_types): + + path2 = "_plugins/_ml/models/"+SAGEMAKER_SPARSE_MODEL_ID+"/_predict" + + url2 = host+path2 + + payload2 = { + "parameters": { + "inputs": query + } + } + + r2 = requests.post(url2, auth=awsauth, json=payload2, headers=headers) + sparse_ = json.loads(r2.text) + query_sparse = sparse_["inference_results"][0]["output"][0]["dataAsMap"]["response"][0] + query_sparse_sorted = {key: value for key, + value in sorted(query_sparse.items(), + key=lambda item: item[1],reverse=True)} + print("text expansion is enabled") + #print(query_sparse_sorted) + query_sparse_sorted_filtered = {} + + rank_features = [] + for key_ in query_sparse_sorted.keys(): + if(query_sparse_sorted[key_]>=st.session_state.input_sparse_filter): + feature = {"rank_feature": {"field": "product_description_sparse_vector."+key_,"boost":query_sparse_sorted[key_]}} + rank_features.append(feature) + query_sparse_sorted_filtered[key_]=query_sparse_sorted[key_] + else: + break + + #print(query_sparse_sorted_filtered) + sparse_payload = {"bool":{"should":rank_features}} + + ###### start of efficient filter applying ##### + if(st.session_state.input_rewritten_query!=""): + sparse_payload['bool']['must'] = filter_['filter']['bool']['must'] + + if(st.session_state.input_manual_filter == "True"): + sparse_payload['bool']['filter']=[] + if(st.session_state.input_category!=None): + sparse_payload['bool']['filter'].append({"term": {"category": st.session_state.input_category}}) + if(st.session_state.input_gender!=None): + sparse_payload['bool']['filter'].append({"term": {"gender_affinity": st.session_state.input_gender}}) + if(st.session_state.input_price!=(0,0)): + sparse_payload['bool']['filter'].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}}) + + +# print("sparse_payload**************") +# print(sparse_payload) + + + ###### end of efficient filter applying ##### + + + #print(sparse_payload) + + # sparse_payload = { + + # "neural_sparse": + # { + # 
"desc_embedding_sparse": + # { + # "query_text": query, + # "model_id": SAGEMAKER_SPARSE_MODEL_ID, + # #"max_token_score": 2 + # } + # } + + # } + + + hybrid_payload["query"]["hybrid"]["queries"].append(sparse_payload) + + + + + + + + + print("hybrid_payload") + print(st.session_state.re_ranker) + print("---------------") + docs = [] + + if(st.session_state.input_sql_query!=""): + url = host +"_plugins/_sql?format=json" + payload = {"query":st.session_state.input_sql_query} + r = requests.post(url, auth=awsauth, json=payload, headers=headers) + print("^^^^^") + print(r.text) + + if(len(hybrid_payload["query"]["hybrid"]["queries"])==1): + single_query = hybrid_payload["query"]["hybrid"]["queries"][0] + del hybrid_payload["query"]["hybrid"] + hybrid_payload["query"] = single_query + if(st.session_state.re_ranker == 'true' and st.session_state.input_reranker == 'Cross Encoder'): + path = "demostore-search-index/_search?search_pipeline=rerank_pipeline" + url = host + path + hybrid_payload["ext"] = {"rerank": { + "query_context": { + "query_text": query + } + }} + + print(hybrid_payload) + print(url) + r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers) + print(r.status_code) + print(r.text) + response_ = json.loads(r.text) + print("-------------------------------------------------------------------") + #print(response_) + docs = response_['hits']['hits'] + + + else: + + + print("hybrid_payload") + print(hybrid_payload) + print("-------------------------------------------------------------------") + + if( st.session_state.input_hybridType == "OpenSearch Hybrid Query"): + url_ = url + "?search_pipeline=hybrid_search_pipeline" + + if(st.session_state.re_ranker == 'true' and st.session_state.input_reranker == 'Cross Encoder'): + + url_ = url + "?search_pipeline=hybrid_rerank_pipeline" + + hybrid_payload["ext"] = {"rerank": { + "query_context": { + "query_text": query + } + }} + print(url_) + r = requests.get(url_, auth=awsauth, json=hybrid_payload, headers=headers) + print(r.status_code) + response_ = json.loads(r.text) + print("-------------------------------------------------------------------") + print(response_) + docs = response_['hits']['hits'] + + else: + all_docs = [] + all_docs_ids = [] + only_hits = [] + + rrf_hits = [] + for i,query in enumerate(hybrid_payload["query"]["hybrid"]["queries"]): + payload_ = {'_source': + {'exclude': ['desc_embedding_bedrock-multimodal', 'desc_embedding_bedrock-text', 'product_description_sparse_vector']}, + 'query': query, + 'size': k_, 'highlight': {'fields': {'product_description': {}}}} + + r_ = requests.get(url, auth=awsauth, json=payload_, headers=headers) + resp = json.loads(r_.text) + all_docs.append({"search":list(query.keys())[0],"results":resp['hits']['hits'],"weight":weights[i]}) + only_hits.append(resp['hits']['hits']) + for hit in resp['hits']['hits']: + all_docs_ids.append(hit["_id"]) + + + id_scores = [] + rrf_hits_unsorted = [] + + for id in all_docs_ids: + score = 0.0 + for result_set in all_docs: + if id in json.dumps(result_set['results']): + for n,res in enumerate(result_set['results']): + if(res["_id"] == id): + score += result_set["weight"] * (1.0 / (n+1)) + id_scores.append({"id":id,"score":score}) + for only_hit in only_hits: + for i_ in only_hit: + if(id == i_["_id"]): + i_["_score"] = score + rrf_hits_unsorted.append(i_) + print("rrf_hits_unsorted------------------------------") + docs = sorted(rrf_hits_unsorted, key=lambda x: x['_score'],reverse=True) + #print(docs) + + + + + arr = [] + dup = [] + for doc in 
docs: + if(doc['_source']['image_url'] not in dup): + res_ = { + "desc":doc['_source']['product_description'], + "caption":doc['_source']['caption'], + "image_url":doc['_source']['image_url'], + "category":doc['_source']['category'], + "price":doc['_source']['price'], + "gender_affinity":doc['_source']['gender_affinity'], + "style":doc['_source']['style'], + + } + if('highlight' in doc): + res_['highlight'] = doc['highlight']['product_description'] + if('NeuralSparse Search' in search_types): + res_['sparse'] = doc['_source']['product_description_sparse_vector'] + res_['query_sparse'] = query_sparse_sorted_filtered +# if(st.session_state.input_rekog_label !="" or st.session_state.input_is_rewrite_query == 'enabled'): +# res_['rekog'] = {'color':doc['_source']['rekog_color'],'category': doc['_source']['rekog_categories'],'objects':doc['_source']['rekog_objects']} + + res_['id'] = doc['_id'] + res_['score'] = doc['_score'] + res_['title'] = doc['_source']['product_description'] + + + arr.append(res_) + dup.append(doc['_source']['image_url']) + + #print(arr) + return arr[0:k_] + + + + diff --git a/semantic_search/amazon_rekognition.py b/semantic_search/amazon_rekognition.py new file mode 100644 index 0000000000000000000000000000000000000000..c1c1b372d0d5670afd57bb579c3398eed52cfb15 --- /dev/null +++ b/semantic_search/amazon_rekognition.py @@ -0,0 +1,155 @@ +import json +import os +import sys +import boto3 +from botocore.config import Config +import getpass +import os +import streamlit as st + + +from opensearchpy import OpenSearch, RequestsHttpConnection +aos_host = 'search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com' + +auth = ("username","password") #### input credentials + +aos_client = OpenSearch( + hosts = [{'host': aos_host, 'port': 443}], + http_auth = auth, + use_ssl = True, + verify_certs = True, + connection_class = RequestsHttpConnection +) +rekog_client = boto3.client('rekognition', region_name='us-east-1') + +def extract_image_metadata(img): + res = rekog_client.detect_labels( + Features= [ "GENERAL_LABELS","IMAGE_PROPERTIES" ], + Image = { + + "Bytes":img + }, + MaxLabels = 10, + MinConfidence = 80.0, + Settings = { + # "GeneralLabels": { + # "LabelCategoryExclusionFilters": [ "string" ], + # "LabelCategoryInclusionFilters": [ "string" ], + # "LabelExclusionFilters": [ "string" ], + # "LabelInclusionFilters": [ "string" ] + # }, + "ImageProperties": { + "MaxDominantColors": 5 + } + } + ) + objects_category_color = {} + objects_category_color_masked = {} + + def add_span(x,type): + if(type == 'obj'): + return ""+x+"" + if(type == 'cat'): + return ""+x+"" + if(type == 'color'): + return ""+x+"" + + + for label in res['Labels']: + objects_category_color_masked[add_span(label['Name'],'obj')]={'categories':[],'color':""} + objects_category_color[label['Name']] = {'categories':[],'color':""} + if(len(label['Categories'])!=0): + for category in label['Categories']: + objects_category_color_masked[add_span(label['Name'],'obj')]['categories'].append(add_span(category['Name'].lower(),'cat')) + objects_category_color[label['Name']]['categories'].append(category['Name'].lower()) + + + if(len(label['Instances'])!=0): + for instance in label['Instances']: + if(len(instance['DominantColors'])!=0): + objects_category_color[label['Name']]['color'] = instance['DominantColors'][0]['SimplifiedColor'] + objects_category_color_masked[""+label['Name']+""]['color'] = add_span(instance['DominantColors'][0]['SimplifiedColor'],'color') + + 
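+    # At this point objects_category_color maps each detected label to its Rekognition
+    # categories and, when available, a dominant colour. Below, these are flattened into a
+    # single "colours objects categories" string that enriches the keyword query; for a
+    # photo of a black handbag this might look like "black handbag apparel and accessories"
+    # (illustrative output only, not taken from an actual Rekognition response).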
st.session_state.input_rekog_directoutput = objects_category_color_masked + objects = [] + categories = [] + colors = [] + for key in objects_category_color.keys(): + if(key.lower() not in objects): + objects.append(key.lower()) + categories.append(" ".join(set(objects_category_color[key]['categories']))) + if(objects_category_color[key]['color']!=''): + colors.append(objects_category_color[key]['color'].lower()) + + objects = " ".join(set(objects)) + categories = " ".join(set(categories)) + colors = " ".join(set(colors)) + + print("^^^^^^^^^^^^^^^^^^") + print(colors+ " " + objects + " " + categories) + + return colors+ " " + objects + " " + categories + +def call(a,b): + print("'''''''''''''''''''''''") + print(b) + + if(st.session_state.input_is_rewrite_query == 'enabled' and st.session_state.input_rewritten_query!=""): + + + #st.session_state.input_rewritten_query['query']['bool']['should'].pop() + st.session_state.input_rewritten_query['query']['bool']['should'].append( { + "simple_query_string": { + + "query": a + " " + b, + "fields":['description','rekog_all^3'] + + } + }) + rekog_query = st.session_state.input_rewritten_query + + else: + rekog_query = { "query":{ + "simple_query_string": { + + "query": a + " " + b, + "fields":['description','rekog_all^3'] + + } + } + } + st.session_state.input_rewritten_query = rekog_query + + # response = aos_client.search( + # body = rekog_query, + # index = 'demo-retail-rekognition' + # #pipeline = 'RAG-Search-Pipeline' + # ) + + + # hits = response['hits']['hits'] + # print("rewrite-------------------------") + # arr = [] + # for doc in hits: + # # if('b5/b5319e00' in doc['_source']['image_s3_url'] ): + # # filter_out +=1 + # # continue + + # res_ = {"desc":doc['_source']['text'].replace(doc['_source']['metadata']['rekog_all']," ^^^ " +doc['_source']['metadata']['rekog_all']), + # "image_url":doc['_source']['metadata']['image_s3_url']} + # if('highlight' in doc): + # res_['highlight'] = doc['highlight']['text'] + # # if('caption_embedding' in doc['_source']): + # # res_['sparse'] = doc['_source']['caption_embedding'] + # # if('query_sparse' in response_ and len(arr) ==0 ): + # # res_['query_sparse'] = response_["query_sparse"] + # res_['id'] = doc['_id'] + # res_['score'] = doc['_score'] + # res_['title'] = doc['_source']['text'] + # res_['rekog'] = {'color':doc['_source']['metadata']['rekog_color'],'category': doc['_source']['metadata']['rekog_categories'],'objects':doc['_source']['metadata']['rekog_objects']} + + # arr.append(res_) + + + + # return arr \ No newline at end of file diff --git a/semantic_search/llm_eval.py b/semantic_search/llm_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..47b8547b4efca446b257cd7e70cf4624d6467b36 --- /dev/null +++ b/semantic_search/llm_eval.py @@ -0,0 +1,149 @@ +import os +import io +import sys +sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities") +import json +import glob +import boto3 +import base64 +import logging +import requests +import numpy as np +import pandas as pd +from PIL import Image +from typing import List +from botocore.auth import SigV4Auth +from langchain.llms.bedrock import Bedrock +from botocore.awsrequest import AWSRequest +import streamlit as st +import re +import numpy as np +from sklearn.metrics import ndcg_score,dcg_score +from sklearn import preprocessing as pre +import invoke_models + +bedrock_ = boto3.client( + 'bedrock-runtime', + aws_access_key_id=st.secrets['user_access_key'], + 
aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1' +) + +inference_modifier = { + "max_tokens_to_sample": 4096, + "temperature": 0, + "top_k": 250, + "top_p": 1, + "stop_sequences": ["\n\nHuman"], +} +textgen_llm = Bedrock( + model_id="anthropic.claude-v2:1", + client=bedrock_, + model_kwargs=inference_modifier, +) + + +#@st.cache_data +def eval(question, answers): + #if() + search_results: str = "" + prompt: str = """Human: You are a grader assessing relevance of a retrieved document to a user question. \n + The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n + It does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n + If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. + + + {} + + + + {} + + + Now based on the information provided above, for every given Retrieved document, provide the index of the document, it's score out of 5 based on relevance with the User question, is it relevant or not as true or false, reason why you this is relevant or not, in json format, + + Answer: + """ + #Finally, as the last line of your response, write the relevant indexes as a comma separated list in a line. + + + query = question[0]['question'] + index_ = 0 + for i in answers[0]['answer']: + desc = i['caption']+ "."+ i['desc'] + search_results += f"Index: {index_}, Description: {desc}\n\n" + index_ = index_+1 + prompt = prompt.format(query, search_results) + # print(answers[0]['answer']) + # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") + # print(prompt) + + response = textgen_llm(prompt) + #invoke_models.invoke_llm_model(prompt,False) + print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") + print(response) + inter_trim =response.split("[")[1] + final_out = json.loads('{"results":['+inter_trim.split("]")[0]+']}') + #final_out_sorted_desc = sorted(final_out['results'], key=lambda d: d['Score'],reverse=True) + # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") + # print(final_out_sorted_desc) + + #true_relevance = np.asarray([[10, 0, 0, 1, 5]]) + llm_scores = [] + current_scores = [] + for idx,i in enumerate(answers[0]['answer']): + if('relevant' in final_out['results'][idx]): + relevance = final_out['results'][idx]['relevant'] + else: + relevance = final_out['results'][idx]['Relevant'] + if('score' in final_out['results'][idx]): + score_ = final_out['results'][idx]['score'] + else: + score_ = final_out['results'][idx]['Score'] + i['relevant'] = relevance + llm_scores.append(score_) + current_scores.append(i['score']) + + + + # llm_scores.sort(reverse = True) + x = np.array(llm_scores) + x = x.reshape(-1, 1) + x_norm = (pre.MinMaxScaler().fit_transform(x)).flatten().tolist() + + y = np.array(current_scores) + y = y.reshape(-1, 1) + y_norm = (pre.MinMaxScaler().fit_transform(y)).flatten().tolist() + + + st.session_state.answers = answers + + # print(x_norm) + # print(y_norm) + + dcg = dcg_score(np.asarray([llm_scores]),np.asarray([current_scores])) + # print("DCG score : ", dcg) + + # IDCG score + idcg = dcg_score(np.asarray([llm_scores]),np.asarray([llm_scores])) + # print("IDCG score : ", idcg) + + # Normalized DCG score + ndcg = dcg + + # print(st.session_state.input_ndcg) + # if(st.session_state.input_previous_query!=""): + # if(st.session_state.input_previous_query == st.session_state.input_text): + # st.session_state.input_ndcg=0.0 + if(ndcg > st.session_state.input_ndcg and 
st.session_state.input_ndcg!=0.0): + st.session_state.ndcg_increase = "↑~"+str('%.3f'%(ndcg-st.session_state.input_ndcg )) + elif(ndcg < st.session_state.input_ndcg): + st.session_state.ndcg_increase = "↓~"+str('%.3f'%(st.session_state.input_ndcg - ndcg)) + else: + st.session_state.ndcg_increase = " ~ " + + + + st.session_state.input_ndcg = ndcg#round(ndcg_score(np.asarray([x_norm]), np.asarray([y_norm]), k=st.session_state.input_K),2) + print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") + print(st.session_state.input_ndcg) + \ No newline at end of file diff --git a/semantic_search/query_rewrite.py b/semantic_search/query_rewrite.py new file mode 100644 index 0000000000000000000000000000000000000000..de04bb8d760db0549c37ce9594485e0c3caf4273 --- /dev/null +++ b/semantic_search/query_rewrite.py @@ -0,0 +1,449 @@ +import json +import os +import sys +import boto3 +import amazon_rekognition +from botocore.config import Config +import getpass +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize +import os +import streamlit as st +from langchain.schema import Document +#from langchain_community.vectorstores import OpenSearchVectorSearch,ElasticsearchStore +from requests_aws4auth import AWS4Auth +from requests.auth import HTTPBasicAuth +from langchain.chains.query_constructor.base import ( + StructuredQueryOutputParser, + get_query_constructor_prompt, +) +from langchain.retrievers.self_query.opensearch import OpenSearchTranslator +from langchain.chains import ConversationChain +from langchain.llms.bedrock import Bedrock +from langchain.memory import ConversationBufferMemory +from langchain.chains.query_constructor.base import AttributeInfo +from langchain.retrievers.self_query.base import SelfQueryRetriever +from langchain_core.prompts.few_shot import FewShotPromptTemplate +from langchain_core.prompts.prompt import PromptTemplate +from langchain.embeddings import BedrockEmbeddings +#from langchain.vectorstores import OpenSearchVectorSearch +from opensearchpy import OpenSearch, RequestsHttpConnection +import utilities.invoke_models as invoke_models + + + +bedrock_params = { + "max_tokens_to_sample":2048, + "temperature":0.0001, + "top_k":250, + "top_p":1, + "stop_sequences":["\\n\\nHuman:"] +} +bedrock_region="us-east-1" + +#boto3_bedrock = boto3.client(service_name="bedrock-runtime", endpoint_url=f"https://bedrock-runtime.{bedrock_region}.amazonaws.com") +boto3_bedrock = boto3.client(service_name="bedrock-runtime", config=Config(region_name=bedrock_region)) + +bedrock_titan_llm = Bedrock(model_id="anthropic.claude-instant-v1", client=boto3_bedrock) +bedrock_titan_llm.model_kwargs = bedrock_params +bedrock_embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v1',client=boto3_bedrock) + + +schema = """{{ + "content": "Brief summary of a retail product", + "attributes": {{ + "category": {{ + "description": "The category of the product, the available categories are apparel, footwear, outdoors, electronics, beauty, jewelry, accessories, housewares, homedecor, furniture, seasonal, floral, books, groceries, instruments, tools, hot dispensed, cold dispensed, food service and salty snacks", + "type": "string" + }}, + "gender_affinity": {{ + "description": "The gender that the product relates to, the choices are Male and Female", + "type": "string" + }}, + "price": {{ + "description": "Cost of the product", + "type": "double" + }}, + "description": {{ + "description": "The detailed description of the product", + "type": "string" + }}, + "color": {{ + "description": "The color of the 
product", + "type": "string" + }}, + "caption": {{ + "description": "The short description of the product", + "type": "string" + }}, + "current_stock": {{ + "description": "The available quantity of the product in stock for sale", + "type": "integer" + }}, + "style": {{ + "description": "The style of the product", + "type": "string" + }} +}} +}}""" +metadata_field_info_ = [ + AttributeInfo( + name="price", + description="Cost of the product", + type="string", + ), + AttributeInfo( + name="style", + description="The style of the product", + type="string", + ), + AttributeInfo( + name="category", + description="The category of the product, the available categories are apparel, footwear, outdoors, electronics, beauty, jewelry, accessories, housewares, homedecor, furniture, seasonal, floral, books, groceries, instruments, tools, hot dispensed, cold dispensed, food service and salty snacks", + type="string", + ), + AttributeInfo( + name="current_stock", + description="The available quantity of the product", + type="string", + ), + AttributeInfo( + name="gender_affinity", + description="The gender that the product relates to, the choices are Male and Female", + type="string" + ), + AttributeInfo( + name="caption", + description="The short description of the product", + type="string" + ), + AttributeInfo( + name="description", + description="The detailed description of the product", + type="string" + ), + AttributeInfo( + name="color", + description="The color of the product", + type="string" + ) +] +document_content_description_ = "Brief summary of a retail product" + +# open_search_vector_store = OpenSearchVectorSearch( +# index_name="retail-ml-search-index",#"self-query-rewrite-retail", +# embedding_function=bedrock_embeddings, +# opensearch_url=os_domain_ep, +# http_auth=auth +# ) + +examples = [ + { "i":1, + "data_source": schema, + "user_query": "black shoes for men", + "structured_request": """{{ + "query": "shoes", + "filter": "and(eq(\"color\", \"black\"), eq(\"category\", \"footwear\")), eq(\"gender_affinity\", \"Male\")" + }}""", + }, + + { "i":2, + "data_source": schema, + "user_query": "black or brown jackets for men under 50 dollars", + "structured_request": """{{ + "query": "jackets", + "filter": "and(eq(\"style\", \"jacket\"), or(eq(\"color\", \"brown\"),eq(\"color\", \"black\")),eq(\"category\", \"apparel\"),eq(\"gender_affinity\", \"male\"),lt(\"price\", \"50\"))" + }}""", + }, + { "i":2, + "data_source": schema, + "user_query": "trendy handbags for women", + "structured_request": """{{ + "query": "handbag", + "filter": "and(eq(\"style\", \"bag\") ,eq(\"category\", \"accessories\"),eq(\"gender_affinity\", \"female\"))" + }}""", + } +] + + +example_prompt = PromptTemplate( + input_variables=["question", "answer"], template="Question: {question}\n{answer}" +) +example_prompt=PromptTemplate(input_variables=['data_source', 'i', 'structured_request', 'user_query'], +template='<< Example {i}. >>\nData Source:\n{data_source}\n\nUser Query:\n{user_query}\n\nStructured Request:\n{structured_request}\n') +#print(example_prompt.format(**examples[0])) + +prefix_ = """ +Your goal is to structure the user's query to match the request schema provided below. 
+ +<< Structured Request Schema >> +When responding, use a markdown code snippet with a JSON object formatted in the following schema: + +```json +{{ + "query": string \ text string to compare to document contents + "filter": string \ logical condition statement for filtering documents +}} +``` + +The query string should contain only text that is expected to match the contents of documents. Any conditions handled by the filter should not be repeated in the query. + +A logical condition statement is composed of one or more comparison and logical operation statements. + +A comparison statement takes the form: `comp(attr, val)`: +- `comp` (eq | ne | gt | gte | lt | lte | contain | like | in | nin): comparator +- `attr` (string): name of attribute to apply the comparison to +- `val` (string): the comparison value + +A logical operation statement takes the form `op(statement1, statement2, ...)`: +- `op` (and | or | not): logical operator +- `statement1`, `statement2`, ... (comparison statements or logical operation statements): one or more statements to apply the operation to + +Make sure that you only use the comparators and logical operators listed above and no others. +Make sure that filters only refer to attributes that exist in the data source. +Make sure that filters only use attribute names, together with their function names if functions are applied to them. +Make sure that filters only use the format `YYYY-MM-DD` when handling date-typed values. +Make sure that filters take into account the descriptions of attributes and only make comparisons that are feasible given the type of data being stored. +Make sure that filters are only used as needed. If there are no filters that should be applied return "NO_FILTER" for the filter value. +""" + +suffix_ = """<< Example 3. 
>> +Data Source: +{schema} + +User Query: +{query} + +Structured Request: +""" + +prompt_ = FewShotPromptTemplate( + examples=examples, + example_prompt=example_prompt, + suffix=suffix_, + prefix=prefix_, + input_variables=["query","schema"], +) + + + + + +# retriever = SelfQueryRetriever.from_llm( +# bedrock_titan_llm, open_search_vector_store, document_content_description_, metadata_field_info_, verbose=True +# ) + +# res = retriever.get_relevant_documents("bagpack for men") + +# st.write(res) + +######### use this for self query retriever ######## +# prompt = get_query_constructor_prompt( +# document_content_description_, +# metadata_field_info_, +# ) +# output_parser = StructuredQueryOutputParser.from_components() +# query_constructor = prompt | bedrock_titan_llm | output_parser + +def get_new_query_res(query): + field_map = {'Price':'price','Gender':'gender_affinity','Category':'category','Style':'style','Color':'color'} + field_map_filter = {key: field_map[key] for key in st.session_state.input_must} + if(query == ""): + query = st.session_state.input_rekog_label + if(st.session_state.input_is_rewrite_query == 'enabled'): + + # query_struct = query_constructor.invoke( + # { + # "query": query + # } + # ) + # print("***prompt****") + # print(prompt) + # print("******query_struct******") + # print(query_struct) + + res = invoke_models.invoke_llm_model( prompt_.format(query=query,schema = schema) ,False) + inter_query = res[7:-3].replace('\\"',"'").replace("\n","") + print("inter_query") + print(inter_query) + query_struct = StructuredQueryOutputParser.from_components().parse(inter_query) + print("query_struct") + print(query_struct) + opts = OpenSearchTranslator() + result_query_llm = opts.visit_structured_query(query_struct)[1]['filter'] + print(result_query_llm) + draft_new_query = {'bool':{'should':[],'must':[]}} + if('bool' in result_query_llm and ('must' in result_query_llm['bool'] or 'should' in result_query_llm['bool'])): + #draft_new_query['bool']['should'] = [] + if('must' in result_query_llm['bool']): + for q in result_query_llm['bool']['must']: + old_clause = list(q.keys())[0] + if(old_clause == 'term'): + new_clause = 'match' + else: + new_clause = old_clause + q_dash = {} + q_dash[new_clause] = {} + long_field = list(q[old_clause].keys())[0] + #print(long_field) + get_attr = long_field.split(".")[1] + #print(get_attr) + q_dash[new_clause][get_attr] = q[old_clause][long_field] + #print(q_dash) + if(get_attr in list(field_map_filter.values())): + draft_new_query['bool']['must'].append(q_dash) + else: + draft_new_query['bool']['should'].append(q_dash) + # if('should' in result_query_llm['bool']): + # for q_ in result_query_llm['bool']['must']: + # q__dash = json.loads(json.dumps(q_).replace('term','match' )) + # clause = list(q__dash.keys())[0]category + # long_field = list(q__dash[clause].keys())[0] + # get_attr = long_field.split(".")[1] + # q__dash[clause][get_attr] = q__dash[clause][long_field] + # draft_new_query['bool']['should'].append(q__dash) + + #print(draft_new_query) + query_ = draft_new_query#json.loads(json.dumps(opts.visit_structured_query(query_struct)[1]['filter']).replace("must","should"))#.replace("must","should") + + # if('bool' in query_ and 'should' in query_['bool']): + # query_['bool']['should'].append({ + # "match": { + + # "rekog_description_plus_original_description": query + + # } + # }) + # else: + # query_['bool']['should'] = { + # "match": { + + # "rekog_description_plus_original_description": query + + # } + # } + + # def find_by_key(data, 
target): + # for key, value in data.items(): + # if isinstance(value, dict): + # yield from find_by_key(value, target) + # elif key == target: + # yield value + # for x in find_by_key(query_, "metadata.category.keyword"): + # imp_item = x + + + ###### find the main subject of the query + #imp_item = "" + # if("bool" in query_ and 'should' in query_['bool']): + # for i in query_['bool']['should']: + # if("term" in i.keys()): + # if("metadata.category.keyword" in i["term"]): + # imp_item = imp_item + i["term"]["metadata.category.keyword"]+ " " + # if("metadata.style.keyword" in i["term"]): + # imp_item = imp_item + i["term"]["metadata.style.keyword"]+ " " + # if("match" in i.keys()): + # if("metadata.category.keyword" in i["match"]): + # imp_item = imp_item + i["match"]["metadata.category.keyword"]+ " " + # if("metadata.style.keyword" in i["match"]): + # imp_item = imp_item + i["match"]["metadata.style.keyword"]+ " " + # else: + # if("term" in query_): + # if("metadata.category.keyword" in query_): + # imp_item = imp_item + query_["metadata.category.keyword"] + " " + # if("metadata.style.keyword" in query_): + # imp_item = imp_item + query_["metadata.style.keyword"]+ " " + # if("match" in query_): + # if("metadata.category.keyword" in query_): + # imp_item = imp_item + query_["metadata.category.keyword"]+ " " + # if("metadata.style.keyword" in query_): + # imp_item = imp_item + query_["metadata.style.keyword"]+ " " + ###### find the main subject of the query + imp_item = (opts.visit_structured_query(query_struct)[0]).replace(",","") + + + if(imp_item == ""): + imp_item = query + + ps = PorterStemmer() + # def stem_(sentence): + # words = word_tokenize(sentence) + + # words_stem = "" + + # for w in words: + # words_stem = words_stem +" "+ps.stem(w) + # return words_stem.strip() + + #imp_item = stem_(imp_item) + print("imp_item---------------") + print(imp_item) + if('must' in query_['bool']): + query_['bool']['must'].append({ + "simple_query_string": { + + "query": imp_item.strip(), + "fields":['description',"style","caption"]#'rekog_all^3' + + } + #"match":{"description":imp_item.strip()} + }) + else: + query_['bool']['must']={ + "multi_match": { + + "query": imp_item.strip(), + "fields":['description',"style"]#'rekog_all^3' + + } + #"match":{"description":imp_item.strip()} + } + + + #query_['bool']["minimum_should_match"] = 1 + + st.session_state.input_rewritten_query = {"query":query_} + print(st.session_state.input_rewritten_query) + # if(st.session_state.input_rekog_label!="" and query!=st.session_state.input_rekog_label): + # amazon_rekognition.call(st.session_state.input_text,st.session_state.input_rekog_label) + + + # #return searchWithNewQuery(st.session_state.input_rewritten_query) + +# def searchWithNewQuery(new_query): +# response = aos_client.search( +# body = new_query, +# index = "demo-retail-rekognition"#'self-query-rewrite-retail', +# #pipeline = 'RAG-Search-Pipeline' +# ) + +# hits = response['hits']['hits'] +# print("rewrite-------------------------") +# arr = [] +# for doc in hits: +# # if('b5/b5319e00' in doc['_source']['image_s3_url'] ): +# # filter_out +=1 +# # continue + +# res_ = {"desc":doc['_source']['text'],"image_url":doc['_source']['metadata']['image_s3_url']} +# if('highlight' in doc): +# res_['highlight'] = doc['highlight']['text'] +# # if('caption_embedding' in doc['_source']): +# # res_['sparse'] = doc['_source']['caption_embedding'] +# # if('query_sparse' in response_ and len(arr) ==0 ): +# # res_['query_sparse'] = response_["query_sparse"] +# res_['id'] = 
doc['_id'] +# res_['score'] = doc['_score'] +# res_['title'] = doc['_source']['text'] + +# arr.append(res_) + + + +# return arr + + + + + + diff --git a/split_pdf/ukhousingstats0.pdf b/split_pdf/ukhousingstats0.pdf new file mode 100644 index 0000000000000000000000000000000000000000..07d08a864517af5f8c9a533bdf0b20d66e8e9012 Binary files /dev/null and b/split_pdf/ukhousingstats0.pdf differ diff --git a/split_pdf/ukhousingstats1.pdf b/split_pdf/ukhousingstats1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..97c106283a177dce686fc30e46b2798f550188b9 Binary files /dev/null and b/split_pdf/ukhousingstats1.pdf differ diff --git a/split_pdf/ukhousingstats2.pdf b/split_pdf/ukhousingstats2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..750e80d1493bc2498f2c054fc52317e2a9efe0d5 Binary files /dev/null and b/split_pdf/ukhousingstats2.pdf differ diff --git a/split_pdf/ukhousingstats3.pdf b/split_pdf/ukhousingstats3.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3c9feb29a13c7d2b4959b4deaca89911d0a7e43d Binary files /dev/null and b/split_pdf/ukhousingstats3.pdf differ diff --git a/split_pdf/ukhousingstats4.pdf b/split_pdf/ukhousingstats4.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bee33b6a82dd0233179557fa21a0ba7a70b9d667 Binary files /dev/null and b/split_pdf/ukhousingstats4.pdf differ diff --git a/split_pdf/ukhousingstats5.pdf b/split_pdf/ukhousingstats5.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5daae571c124452e8faf49147672e1467f1f9a9d Binary files /dev/null and b/split_pdf/ukhousingstats5.pdf differ diff --git a/split_pdf_csv/ukhousingstats2.csv b/split_pdf_csv/ukhousingstats2.csv new file mode 100644 index 0000000000000000000000000000000000000000..711ce696560d82014567ceb22f045b17695f35f7 --- /dev/null +++ b/split_pdf_csv/ukhousingstats2.csv @@ -0,0 +1,15 @@ +`` +13x Current house price to`London and the East of England have led the rebound in new buyer demand in the first weeks of 2024. Most other areas recorded below- average increases in demand, typically rising in line with last year or only ahead by single digits.` +earnings ratio in London`The rebound in London is uniform across the market segments - inner- London, suburban outer-London and the core commuter areas around London. This could reflect a turn of fortunes for the London housing market. Over the last seven years, the city has lagged behind the rest of the UK in terms of sales volumes and house price inflation.` +`Our house price index shows that London house prices have risen just 13% since the start of 2016. Meanwhile, they are 34% higher across the UK and almost 50% higher in Wales. The average value of a flat in London is just 2% higher over the same period.` +`Fast house price appreciation in the early 2010's saw London reach 'peak unaffordability' in 2016 with a price to earnings ratio of over 15x. A succession of factors has subsequently hit demand and pricing in the capital e.g. tax changes aimed at investors and overseas buyers, the Brexit vote, which hit jobs growth and a global pandemic that closed cities to travel and changed working patterns. This, combined with higher mortgage rates which have hit the most expensive housing markets hardest.` +`Low house price inflation since 2016 and rising earnings means housing affordability in London, measured on a house price to earnings ratio basis, is at its lowest since 2014. 
However, London housing prices remain expensive by UK standards at 13x earnings.` +`Slowly improving housing affordability in London is positive news but home buyers still face a sizable affordability challenge with mortgage rates doubling since 2021. We expect market conditions in London to continue to improve over "2024," with earnings rising faster than house prices. This will continue to improve affordability and support levels of housing sales rather than boost house prices.` +`Housing affordability improves but wide north-south variation remains 16 14 12 10 8 6 4` +`2 London South excl London Midlands North Wales Scotland o` +`2003 2005 2007 2009 2011 2013 2015 2017 2019 2021 2023 Source: Zoopla House Price Index and ONS ASHE Single full time earner income used` + + + + + diff --git a/split_pdf_csv/ukhousingstats4.csv b/split_pdf_csv/ukhousingstats4.csv new file mode 100644 index 0000000000000000000000000000000000000000..5a1d98a9af41c5539c5456d964990f39aa1fce59 --- /dev/null +++ b/split_pdf_csv/ukhousingstats4.csv @@ -0,0 +1,25 @@ +City`Annual in`% change house price` +1`Belfast`4.0%` +2`Glasgow`1.5%` +3`Edinburgh`1.3%` +4`Liverpool`0.9%` +5`Leeds`0.6%` +6`Newcastle`0.5%` +7`Sheffield`0.4%` +8`Manchester`0.4%` +9`Cardiff`0.1%` +10`Birmingham`-0.1%` +11`Nottingham`-0.6%` +12`Oxford`-0.6%` +13`Bristol`-1.5%` +14`Leicester`-1.8%` +15`Bournemouth`-1.9%` +16`Cambridge`-1.9%` +17`Portsmouth`-2.3%` +18`Southampton`-2.7%` +19`Aberdeen`-2.9%` + + + + + diff --git a/split_pdf_csv/ukhousingstats5.csv b/split_pdf_csv/ukhousingstats5.csv new file mode 100644 index 0000000000000000000000000000000000000000..ca8fbdfd40204c0dd3aca4315e0ce06ea9a92f67 --- /dev/null +++ b/split_pdf_csv/ukhousingstats5.csv @@ -0,0 +1,28 @@ +`Average price`%YoY Dec-23`%YoY Dec-22`Monthly trend`Annual trend` +United Kingdom`£264,400`-0.8%`6.1%``` +20 City Composite`£304,400`-0.7%`5.4%``` +Belfast`£170,200`4.0%`3.2%``` +Glasgow`£146,300`1.5%`4.4%``` +Edinburgh`£269,900`1.3%`3.8%``` +Liverpool`£157,400`0.9%`7.6%``` +Leeds`£209,000`0.6%`6.9%``` +Newcastle`£153,000`0.5%`6.4%``` +Manchester`£223,000`0.4%`7.8%``` +Sheffield`£172,000`0.4%`7.5%``` +Cardiff`£252,600`0.1%`6.8%``` +Birmingham`£208,100`-0.1%`7.9%``` +Nottingham`£201,800`-0,6%`9.1%``` +Oxford`£444,500`-0.6%`3.5%``` +London`£536,800`-1.1%`2.4%``` +Bristol`£338,300`-1.5%`7.1%``` +Leicester`£225,400`-1.8%`8.4%``` +Bournemouth`£334,300`-1.9%`5.9%``` +Cambridge`£470,100`-1.9%`4.1%``` +Portsmouth`£278,300`-2.3%`7.5%``` +Southampton`£257,400`-2.7%`6.8%``` +Aberdeen`£139,900`-2.9%`-0.7%``` + + + + + diff --git a/uploaded_images/red sneakers with black laces-resized.jpg b/uploaded_images/red sneakers with black laces-resized.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0cc958d1985ecbea620e7d91b6b39b748c1d19d1 Binary files /dev/null and b/uploaded_images/red sneakers with black laces-resized.jpg differ diff --git a/uploaded_images/red sneakers with black laces-resized_display.jpg b/uploaded_images/red sneakers with black laces-resized_display.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0a667b4807fc869403c146167d4b6f64a00d2a43 Binary files /dev/null and b/uploaded_images/red sneakers with black laces-resized_display.jpg differ diff --git a/uploaded_images/red sneakers with black laces.jpg b/uploaded_images/red sneakers with black laces.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4f5ced2fbd2f22dc8d2969eb5767b8a8df747240 Binary files /dev/null and b/uploaded_images/red sneakers with black 
laces.jpg differ diff --git a/utilities/colbert.py b/utilities/colbert.py new file mode 100644 index 0000000000000000000000000000000000000000..da5bbf3395ff75b966fc8645c2eed06b13d43a98 --- /dev/null +++ b/utilities/colbert.py @@ -0,0 +1,101 @@ +from transformers import AutoTokenizer, AutoModel +import torch +import torch.nn.functional as F +import numpy as np +import streamlit as st + +#Mean Pooling - Take attention mask into account for correct averaging +def mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] #First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + + +# Load model from HuggingFace Hub +tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') +model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') +f = open("/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp/utilities/colbert_vocab.txt", "r") +vocab = f.read() +vocab_arr = vocab.split("\n") +vocab_arr +vocab_dict={} +for index,n in enumerate(vocab_arr): + vocab_dict[str(index)]=n + + + +def vectorise(sentence,token_level_vectors): + print("-------------colbert ---- 2-----------") + # Tokenize sentences + encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt') + # Compute token embeddings + with torch.no_grad(): + model_output = model(**encoded_input) + if(token_level_vectors): + return encoded_input['input_ids'][0].tolist(),model_output['last_hidden_state'][0] + + # Perform pooling + sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) + + # Normalize embeddings + sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) + + return sentence_embeddings[0].tolist() + +def search(hits): + print("-------------COLBERT------------4------------------------------------------") + token_ids,token_vectors = vectorise(st.session_state.input_text,True) + final_docs = [] + for ind,j in enumerate(hits): + max_score_dict_list = [] + doc={"_source": + { + "description":j["_source"]["description"],"caption":j["_source"]["title"], + "image_s3_url":j["_source"]["image_s3_url"],"price":j["_source"]["price"], + "style":j["_source"]["style"],"category":j["_source"]["category"]},"_id":j["_id"],"_score":j["_score"]} + + if("gender_affinity" in j["_source"]): + doc["_source"]["gender_affinity"] = j["_source"]["gender_affinity"] + else: + doc["_source"]["gender_affinity"] = "" + #print(j["_source"]["title"]) + source_doc_token_keys = list(j["_source"].keys()) + with_s = [x for x in source_doc_token_keys if x.startswith("description-token-")] + add_score = 0 + + for index,i in enumerate(token_vectors): + token = vocab_dict[str(token_ids[index])] + if(token!='[SEP]' and token!='[CLS]'): + query_token_vector = np.array(i) + print("query token: "+token) + print("-----------------") + scores = [] + for m in with_s: + m_arr = m.split("-") + if(m_arr[-1]!='[SEP]' and m_arr[-1]!='[CLS]'): + #print("document token: "+m_arr[3]) + doc_token_vector = np.array(j["_source"][m]) + score = np.dot(query_token_vector,doc_token_vector) + scores.append({"doc_token":m_arr[3],"score":score}) + #print({"doc_token":m_arr[3],"score":score}) + + newlist = sorted(scores, key=lambda d: d['score'], reverse=True) + max_score = newlist[0]['score'] + add_score+=max_score + max_score_dict_list.append(newlist[0]) 
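+                # Descriptive note: this is ColBERT-style "MaxSim" late interaction. For the current query
+                # token we keep only its best-matching document token (highest dot-product score); add_score
+                # accumulates these per-token maxima, and documents are later ranked by that total_score.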
+ print(newlist[0]) + max_score_dict_list_sorted = sorted(max_score_dict_list, key=lambda d: d['score'], reverse=True) + print(max_score_dict_list_sorted) + # print(add_score) + doc["total_score"] = add_score + doc['max_score_dict_list_sorted'] = max_score_dict_list_sorted + final_docs.append(doc) + final_docs_sorted = sorted(final_docs, key=lambda d: d['total_score'], reverse=True) + print("-------------COLBERT-----final--------") + print(final_docs_sorted) + return final_docs_sorted + + + + + diff --git a/utilities/invoke_models.py b/utilities/invoke_models.py new file mode 100644 index 0000000000000000000000000000000000000000..ac822ad3572452aa8a51c8c57d245f36c7e31fdc --- /dev/null +++ b/utilities/invoke_models.py @@ -0,0 +1,237 @@ +import boto3 +import json +#from IPython.display import clear_output, display, display_markdown, Markdown +import pandas as pd +from langchain.agents.agent_types import AgentType +from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent +from langchain_core.prompts import ChatPromptTemplate +from langchain_community.chat_models import BedrockChat +import streamlit as st +#from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer +#import torch + +region = 'us-east-1' +bedrock_runtime_client = boto3.client( + 'bedrock-runtime', + aws_access_key_id=st.secrets['user_access_key'], + aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1' +) + + + +# def generate_image_captions_ml(): +# model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +# feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +# tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning") + +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +# model.to(device) +# max_length = 16 +# num_beams = 4 +# gen_kwargs = {"max_length": max_length, "num_beams": num_beams} + +def invoke_model(input): + response = bedrock_runtime_client.invoke_model( + body=json.dumps({ + 'inputText': input + }), + modelId="amazon.titan-embed-text-v1", + accept="application/json", + contentType="application/json", + ) + + response_body = json.loads(response.get("body").read()) + return response_body.get("embedding") + +def invoke_model_mm(text,img): + body_ = { + "inputText": text, + + } + if(img!='none'): + body_['inputImage']=img + + body = json.dumps(body_) + + modelId = 'amazon.titan-embed-image-v1' + accept = 'application/json' + contentType = "application/json" + + response = bedrock_runtime_client.invoke_model( + body=body, modelId=modelId, accept=accept, contentType=contentType + ) + response_body = json.loads(response.get("body").read()) + #print(response_body) + return response_body.get("embedding") + +def invoke_llm_model(input,is_stream): + if(is_stream == False): + response = bedrock_runtime_client.invoke_model( + modelId= "anthropic.claude-3-sonnet-20240229-v1:0",#"anthropic.claude-3-5-sonnet-20240620-v1:0",, + contentType = "application/json", + accept = "application/json", + + body = json.dumps({ + "anthropic_version": "bedrock-2023-05-31", + "max_tokens": 1024, + "temperature": 0.001, + "top_k": 250, + "top_p": 1, + "stop_sequences": [ + "\n\nHuman:" + ], + "messages": [ + { + "role": "user", + "content":input + } + ] + } + + ) + ) + + res = (response.get('body').read()).decode() + + return (json.loads(res))['content'][0]['text'] + + # response = bedrock_runtime_client.invoke_model_with_response_stream( + # 
body=json.dumps({ + # "prompt": input, + # "max_tokens_to_sample": 300, + # "temperature": 0.5, + # "top_k": 250, + # "top_p": 1, + # "stop_sequences": [ + # "\n\nHuman:" + # ], + # # "anthropic_version": "bedrock-2023-05-31" + # }), + # modelId="anthropic.claude-v2:1", + # accept="application/json", + # contentType="application/json", + # ) + # stream = response.get('body') + + # return stream + + # else: + # response = bedrock_runtime_client.invoke_model_with_response_stream( + # modelId= "anthropic.claude-3-sonnet-20240229-v1:0", + # contentType = "application/json", + # accept = "application/json", + + # body = json.dumps({ + # "anthropic_version": "bedrock-2023-05-31", + # "max_tokens": 1024, + # "temperature": 0.0001, + # "top_k": 150, + # "top_p": 0.7, + # "stop_sequences": [ + # "\n\nHuman:" + # ], + # "messages": [ + # { + # "role": "user", + # "content":input + # } + # ] + # } + + # ) + # ) + + # stream = response.get('body') + + # return stream + +def read_from_table(file,question): + print("started table analysis:") + print("-----------------------") + print("\n\n") + print("Table name: "+file) + print("-----------------------") + print("\n\n") + bedrock_params = { + "max_tokens":2048, + "temperature":0.0001, + "top_k":150, + "top_p":0.7, + "stop_sequences":["\\n\\nHuman:"] + } + + model = BedrockChat( + client=bedrock_runtime_client, + model_id='anthropic.claude-3-sonnet-20240229-v1:0', + model_kwargs=bedrock_params, + streaming=False + ) + if(str(type(file))==""): + df = pd.read_csv(file,skipinitialspace = True, on_bad_lines='skip',delimiter = "`") + else: + df = file + #df.fillna(method='pad', inplace=True) + agent = create_pandas_dataframe_agent( + model, + df, + verbose=True, + agent_executor_kwargs={'handle_parsing_errors':True, + 'return_only_outputs':True},allow_dangerous_code = True + ) + agent_res = agent.invoke(question)['output'] + return agent_res + +def generate_image_captions_llm(base64_string,question): + + # ant_client = Anthropic() + # MODEL_NAME = "claude-3-opus-20240229" + + # message_list = [ + # { + # "role": 'user', + # "content": [ + # {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": base64_string}}, + # {"type": "text", "text": "What is in the image ?"} + # ] + # } + # ] + + # response = ant_client.messages.create( + # model=MODEL_NAME, + # max_tokens=2048, + # messages=message_list + # ) + response = bedrock_runtime_client.invoke_model( + modelId= "anthropic.claude-3-sonnet-20240229-v1:0", + contentType = "application/json", + accept = "application/json", + + body = json.dumps({ + "anthropic_version": "bedrock-2023-05-31", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": base64_string + } + }, + { + "type": "text", + "text": question + } + ] + } + ] + })) + #print(response) + response_body = json.loads(response.get("body").read())['content'][0]['text'] + + #print(response_body) + + return response_body \ No newline at end of file diff --git a/utilities/re_ranker.py b/utilities/re_ranker.py new file mode 100644 index 0000000000000000000000000000000000000000..461df6318fc0c710084de1224d5f0fc3d3620b8e --- /dev/null +++ b/utilities/re_ranker.py @@ -0,0 +1,158 @@ +import boto3 +from botocore.exceptions import ClientError +import pprint +import time +import streamlit as st +from sentence_transformers import CrossEncoder + +model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512) 
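+# Note: the cross-encoder scores each (query, passage) pair jointly and returns one relevance score per
+# pair; re_rank() below sorts candidates by that score. A minimal, illustrative call (hypothetical inputs):
+#   scores = model.predict([("summer dress", "floral summer dress"), ("summer dress", "leather boots")])
+#   # -> two floats; the better-matching first pair should receive the higher score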
+kendra_ranking = boto3.client("kendra-ranking",region_name = 'us-east-1') + + +print("Create a rescore execution plan.") + +# Provide a name for the rescore execution plan +name = "MyRescoreExecutionPlan" +# Set your required additional capacity units +# Don't set capacity units if you don't require more than 1 unit given by default +capacity_units = 2 + +# try: +# rescore_execution_plan_response = kendra_ranking.create_rescore_execution_plan( +# Name = name, +# CapacityUnits = {"RescoreCapacityUnits":capacity_units} +# ) + +# pprint.pprint(rescore_execution_plan_response) + +# rescore_execution_plan_id = rescore_execution_plan_response["Id"] + +# print("Wait for Amazon Kendra to create the rescore execution plan.") + +# while True: +# # Get the details of the rescore execution plan, such as the status +# rescore_execution_plan_description = kendra_ranking.describe_rescore_execution_plan( +# Id = rescore_execution_plan_id +# ) +# # When status is not CREATING quit. +# status = rescore_execution_plan_description["Status"] +# print(" Creating rescore execution plan. Status: "+status) +# time.sleep(60) +# if status != "CREATING": +# break + +# except ClientError as e: +# print("%s" % e) + +# print("Program ends.") + + + +def re_rank(self_, rerank_type, search_type, question, answers): + + print("start") + print() + + + ans = [] + ids = [] + ques_ans = [] + query = question[0]['question'] + for i in answers[0]['answer']: + if(self_ == "search"): + + ans.append({ + "Id": i['id'], + "Body": i["desc"], + "OriginalScore": i['score'], + "Title":i["desc"] + }) + ids.append(i['id']) + ques_ans.append((query,i["desc"])) + + else: + ans.append({'text':i}) + + ques_ans.append((query,i)) + + + + re_ranked = [{}] + + + + + + if(rerank_type == 'Kendra Rescore'): + + + + + rescore_response = kendra_ranking.rescore( + RescoreExecutionPlanId = 'b2a4d4f3-98ff-4e17-8b69-4c61ed7d91eb', + SearchQuery = query, + Documents = ans + ) + + + #[{'DocumentId': 'DocId1', 'Score': 2.0}, {'DocumentId': 'DocId2', 'Score': 1.0}] + + + re_ranked[0]['answer']=[] + for result in rescore_response["ResultItems"]: + + pos_ = ids.index(result['DocumentId']) + + re_ranked[0]['answer'].append(answers[0]['answer'][pos_]) + re_ranked[0]['search_type']=search_type, + re_ranked[0]['id'] = len(question) + + #st.session_state.answers_none_rank = st.session_state.answers + return re_ranked + + + # if(rerank_type == 'None'): + + # st.session_state.answers = st.session_state.answers_none_rank + + + if(rerank_type == 'Cross Encoder'): + + scores = model.predict( + ques_ans + ) + + print("scores") + print(scores) + index__ = 0 + for i in ans: + i['new_score'] = scores[index__] + index__ = index__+1 + + ans_sorted = sorted(ans, key=lambda d: d['new_score'],reverse=True) + + + def retreive_only_text(item): + return item['text'] + + if(self_ == 'rag'): + return list(map(retreive_only_text, ans_sorted)) + + + re_ranked[0]['answer']=[] + for j in ans_sorted: + pos_ = ids.index(j['Id']) + re_ranked[0]['answer'].append(answers[0]['answer'][pos_]) + re_ranked[0]['search_type']= search_type, + re_ranked[0]['id'] = len(question) + return re_ranked + + + + + #return st.session_state.answers + + + + +
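+# Minimal usage sketch (hypothetical values; argument shapes inferred from how re_rank() reads its inputs):
+#   question = [{'question': 'red sneakers with black laces'}]
+#   answers  = [{'answer': [{'id': 'p1', 'desc': 'red canvas sneakers', 'score': 1.2},
+#                           {'id': 'p2', 'desc': 'black leather boots', 'score': 1.1}]}]
+#   re_rank("search", "Cross Encoder", "Keyword Search", question, answers)
+#   # -> [{'answer': [...same items, re-ordered by cross-encoder score...], 'search_type': ..., 'id': 1}]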