Spaces:
Running
Running
error handling
Browse files
app.py
CHANGED
@@ -32,7 +32,7 @@ def process_pdfs(parent_dir: Union[str,list]):
|
|
32 |
parent_dir = [parent_dir]
|
33 |
for file_path in parent_dir:
|
34 |
if ".pdf" not in file_path : # skip non pdf files
|
35 |
-
|
36 |
# creating a pdf file object
|
37 |
pdfFileObj = open(file_path, 'rb')
|
38 |
|
@@ -48,8 +48,8 @@ def process_pdfs(parent_dir: Union[str,list]):
|
|
48 |
txt = txt.replace("\t","") # strip tabs
|
49 |
txt = re.sub(r" +"," ",txt) # strip extra space
|
50 |
# 512 is related to the positional encoding of the "facebook/dpr-ctx_encoder-single-nq-base" model
|
|
|
51 |
if len(txt) < 512 :
|
52 |
-
file_name = file_path.split("/")[-1]
|
53 |
new_data = {"title":f"{file_name}-page-{i}","text":txt}
|
54 |
df = df.append(new_data,ignore_index=True)
|
55 |
else :
|
@@ -70,6 +70,8 @@ def process(example):
|
|
70 |
|
71 |
def process_dataset(df):
|
72 |
"""processes the dataframe and returns a dataset variable"""
|
|
|
|
|
73 |
ds = Dataset.from_pandas(df)
|
74 |
ds = ds.map(process)
|
75 |
ds.add_faiss_index(column='embeddings') # add faiss index
|
@@ -77,19 +79,26 @@ def process_dataset(df):
|
|
77 |
|
78 |
def search(query, ds, k=3):
|
79 |
"""searches the query in the dataset and returns the k most similar"""
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
86 |
return out
|
87 |
|
88 |
def predict(query,file_paths, k=3):
|
89 |
"""predicts the most similar files to the query"""
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
93 |
|
94 |
with gr.Blocks() as demo :
|
95 |
with gr.Column():
|
|
|
32 |
parent_dir = [parent_dir]
|
33 |
for file_path in parent_dir:
|
34 |
if ".pdf" not in file_path : # reject non-pdf files
|
35 |
+
raise Exception("only pdf files are supported")
|
36 |
# creating a pdf file object
|
37 |
pdfFileObj = open(file_path, 'rb')
|
38 |
|
|
|
48 |
txt = txt.replace("\t","") # strip tabs
|
49 |
txt = re.sub(r" +"," ",txt) # strip extra space
|
50 |
# 512 is related to the positional encoding of the "facebook/dpr-ctx_encoder-single-nq-base" model
|
51 |
+
file_name = file_path.split("/")[-1]
|
52 |
if len(txt) < 512 :
|
|
|
53 |
new_data = {"title":f"{file_name}-page-{i}","text":txt}
|
54 |
df = df.append(new_data,ignore_index=True)
|
55 |
else :
|
|
|
70 |
|
71 |
def process_dataset(df):
|
72 |
"""processes the dataframe and returns a dataset variable"""
|
73 |
+
if len(df) == 0 :
|
74 |
+
raise Exception("empty pdf files, or can't read text from them")
|
75 |
ds = Dataset.from_pandas(df)
|
76 |
ds = ds.map(process)
|
77 |
ds.add_faiss_index(column='embeddings') # add faiss index
|
|
|
79 |
|
80 |
def search(query, ds, k=3):
|
81 |
"""searches the query in the dataset and returns the k most similar"""
|
82 |
+
try :
|
83 |
+
tokens = q_tokenizer(query, return_tensors="pt")
|
84 |
+
query_embed = q_encoder(**tokens)[0][0].numpy()
|
85 |
+
scores, retrieved_examples = ds.get_nearest_examples("embeddings", query_embed, k=k)
|
86 |
+
out = f"""title : {retrieved_examples["title"][0]},\ncontent: {retrieved_examples["text"][0]}
|
87 |
+
similar resources: {retrieved_examples["title"]}
|
88 |
+
"""
|
89 |
+
except Exception as e:
|
90 |
+
out = f"error: {e}"
|
91 |
return out
|
92 |
|
93 |
def predict(query,file_paths, k=3):
|
94 |
"""predicts the most similar files to the query"""
|
95 |
+
try :
|
96 |
+
df = process_pdfs(file_paths)
|
97 |
+
ds = process_dataset(df)
|
98 |
+
out = search(query,ds,k=k)
|
99 |
+
except Exception as e:
|
100 |
+
out = f"error: {e}"
|
101 |
+
return out
|
102 |
|
103 |
with gr.Blocks() as demo :
|
104 |
with gr.Column():
|