Spaces:
Runtime error
Runtime error
File size: 5,063 Bytes
7f46a81 c10340e 7f46a81 6541511 7f46a81 f26592e 7f46a81 6541511 b4ea488 6541511 b4ea488 6541511 5fc81fd 7f46a81 b4ea488 f26592e a719df1 c10340e 5cebf82 7f46a81 b4ea488 7f46a81 d56438d 7f46a81 7ff5239 6541511 7f46a81 6541511 7f46a81 39e2176 a007296 8f83356 108fa2c 8f83356 c10340e 85d2afe af523d4 8f83356 7f46a81 f26592e 7f46a81 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import requests
import json
import re
from urllib.parse import quote
def extract_between_tags(text, start_tag, end_tag):
    """Return the part of *text* enclosed by *start_tag* and *end_tag*.

    Only the first occurrence of *start_tag* is considered, and the first
    occurrence of *end_tag* that follows it. Neither tag is included in the
    result.

    Args:
        text: The string to search.
        start_tag: Marker that opens the snippet.
        end_tag: Marker that closes the snippet.

    Returns:
        The text strictly between the two tags. If *start_tag* is absent the
        snippet is taken to start at the beginning of *text*; if *end_tag* is
        absent the snippet runs to the end of *text*.
    """
    start_index = text.find(start_tag)
    # find() returns -1 when the tag is missing; fall back to the start of the
    # text instead of slicing from an arbitrary offset.
    content_start = 0 if start_index == -1 else start_index + len(start_tag)

    # Search after the opening tag so an earlier stray end tag is ignored.
    end_index = text.find(end_tag, content_start)
    if end_index == -1:
        return text[content_start:]

    # end_index is where the end tag begins, so it is already the exclusive
    # upper bound of the snippet; subtracting len(end_tag) would drop content.
    return text[content_start:end_index]
class VectaraQuery():
    """Send chat-style queries to the Vectara v1 query endpoint.

    Each call to :meth:`submit_query` posts a query with a summary request
    and returns the summary text, with its bracketed citations (``[1]``,
    ``[2]``, ...) rewritten as markdown links to the cited documents.
    """

    # Seconds to wait for the HTTP response. Kept slightly above the 60S
    # "grpc-timeout" request header so that header's limit can elapse first.
    _HTTP_TIMEOUT_SECONDS = 70

    def __init__(self, api_key: str, customer_id: int, corpus_ids: list):
        """Store the credentials and the corpora to query.

        Args:
            api_key: Value sent in the ``x-api-key`` request header.
            customer_id: Value sent (as a string) in the ``customer-id``
                header and in every corpus key.
            corpus_ids: Corpus ids to search; one corpus key is built per id.
        """
        self.customer_id = customer_id
        self.corpus_ids = corpus_ids
        self.api_key = api_key
        # NOTE(review): hard-coded initial conversation id, so every new
        # instance sends this same id on its first request. Confirm intended.
        self.conv_id = "1dc2c542-925f-4b48-aab5-e3df6f2a3a64"

    def submit_query(self, query_str: str):
        """Run *query_str* and return the summary with linked citations.

        On success, ``self.conv_id`` is replaced with the conversation id
        found in the response.

        Args:
            query_str: The user's query text.

        Returns:
            The summary text with resolvable citations turned into markdown
            links, or a fixed apology string when the request fails or the
            response status is not 200.
        """
        endpoint = "https://api.vectara.io/v1/query"
        start_tag = "%START_SNIPPET%"
        end_tag = "%END_SNIPPET%"
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "customer-id": str(self.customer_id),
            "x-api-key": self.api_key,
            "grpc-timeout": "60S"
        }
        body = self._build_body(query_str, start_tag, end_tag)

        try:
            # Without a timeout, requests waits indefinitely on a stalled
            # connection.
            response = requests.post(
                endpoint,
                data=json.dumps(body),
                verify=True,
                headers=headers,
                timeout=self._HTTP_TIMEOUT_SECONDS,
            )
        except requests.RequestException as err:
            # Treat transport failures like a non-200 response: log and
            # return the apology string.
            print(f"Query failed before a response was received: {err}")
            return "Sorry, I'm experiencing an error. Please report this and try again later."

        if response.status_code != 200:
            print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
            return "Sorry, I'm experiencing an error. Please report this and try again later."

        res = response.json()
        result = res['responseSet'][0]
        summary_info = result['summary'][0]

        top_k = 10
        summary = summary_info['text']
        responses = result['response'][:top_k]
        docs = result['document']

        # NOTE(review): the chat 'status' field is not inspected here; an
        # earlier status check was disabled. Confirm whether failed chat
        # turns need explicit handling.
        self.conv_id = summary_info['chat']['conversationId']

        return self._link_citations(summary, responses, docs, start_tag, end_tag)

    def _build_body(self, query_str, start_tag, end_tag):
        """Build the JSON-serialisable request body for one query."""
        corpora_key_list = [
            {
                'customer_id': str(self.customer_id),
                'corpus_id': str(corpus_id),
                'lexical_interpolation_config': {'lambda': 0.025},
            }
            for corpus_id in self.corpus_ids
        ]
        return {
            'query': [
                {
                    'query': query_str,
                    'start': 0,
                    'numResults': 50,
                    'corpusKey': corpora_key_list,
                    'context_config': {
                        'sentences_before': 2,
                        'sentences_after': 2,
                        'start_tag': start_tag,
                        'end_tag': end_tag,
                    },
                    'rerankingConfig': {
                        'rerankerId': 272725718,
                        'mmrConfig': {
                            'diversityBias': 0.3
                        }
                    },
                    'summary': [
                        {
                            'responseLang': 'eng',
                            'maxSummarizedResults': 5,
                            'summarizerPromptName': 'vectara-experimental-summary-ext-2023-12-11-sml',
                            'chat': {
                                'store': True,
                                'conversationId': self.conv_id
                            },
                            'debug': True,
                        }
                    ]
                }
            ]
        }

    @staticmethod
    def _link_citations(summary, responses, docs, start_tag, end_tag):
        """Replace ``[n]`` citations in *summary* with numbered markdown links.

        Each link points at the cited document's ``url`` metadata value with
        a ``#:~:text=`` fragment holding the URL-quoted snippet. Links are
        numbered by order of first appearance of each distinct URL.
        A citation is left as-is when its number does not index *responses*
        or when the cited document has no ``url`` metadata entry.
        """
        pattern = r'\[\d{1,2}\]'
        resolved = []    # (start, end, url) for every citation that resolves
        numbering = {}   # url -> 1-based citation number, in first-seen order

        for match in re.finditer(pattern, summary):
            start, end = match.span()
            response_num = int(summary[start + 1:end - 1])
            # Guard the 1-based index: "[0]" would otherwise wrap around to
            # the last response and larger numbers would raise IndexError.
            if not 1 <= response_num <= len(responses):
                continue
            response = responses[response_num - 1]
            doc_num = response['documentIndex']
            metadata = {item['name']: item['value'] for item in docs[doc_num]['metadata']}
            if 'url' not in metadata:
                continue
            text = extract_between_tags(response['text'], start_tag, end_tag)
            url = f"{metadata['url']}#:~:text={quote(text)}"
            numbering.setdefault(url, len(numbering) + 1)
            resolved.append((start, end, url))

        # Substitute from the end so earlier spans keep their offsets.
        for start, end, url in reversed(resolved):
            # "\\[" emits a literal backslash before the bracket in the
            # output string.
            summary = summary[:start] + f'[\\[{numbering[url]}\\]]({url})' + summary[end:]
        return summary
|