Spaces:

not-lain
/

PDF-Search-Engine

Running

App Files

xet

Community

not-lain committed on Dec 14, 2023

Commit

320f164

1 Parent(s): a023810

error handling

Browse files

Files changed (1) hide show

app.py +20 -11

app.py CHANGED Viewed

@@ -32,7 +32,7 @@ def process_pdfs(parent_dir: Union[str,list]):
         parent_dir = [parent_dir]
     for file_path in parent_dir:
         if ".pdf" not in file_path : # skip non pdf files
-            continue
         # creating a pdf file object
         pdfFileObj = open(file_path, 'rb')
@@ -48,8 +48,8 @@ def process_pdfs(parent_dir: Union[str,list]):
             txt = txt.replace("\t","") # strip tabs
             txt = re.sub(r" +"," ",txt) # strip extra space
             # 512 is related to the positional encoding "facebook/dpr-ctx_encoder-single-nq-base" model
             if len(txt) < 512 :
-                file_name = file_path.split("/")[-1]
                 new_data = {"title":f"{file_name}-page-{i}","text":txt}
                 df = df.append(new_data,ignore_index=True)
             else :
@@ -70,6 +70,8 @@ def process(example):
 def process_dataset(df):
     """processess the dataframe and returns a dataset variable"""
     ds = Dataset.from_pandas(df)
     ds = ds.map(process)
     ds.add_faiss_index(column='embeddings') # add faiss index
@@ -77,19 +79,26 @@ def process_dataset(df):
 def search(query, ds, k=3):
     """searches the query in the dataset and returns the k most similar"""
-    tokens = q_tokenizer(query, return_tensors="pt")
-    query_embed = q_encoder(**tokens)[0][0].numpy()
-    scores, retrieved_examples = ds.get_nearest_examples("embeddings", query_embed, k=k)
-    out = f"""title : {retrieved_examples["title"][0]},\ncontent: {retrieved_examples["text"][0]}
-    similar resources: {retrieved_examples["title"]}
-    """
     return out
 def predict(query,file_paths, k=3):
     """predicts the most similar files to the query"""
-    df = process_pdfs(file_paths)
-    ds = process_dataset(df)
-    return search(query,ds,k=k)
 with gr.Blocks() as demo :
     with gr.Column():

         parent_dir = [parent_dir]
     for file_path in parent_dir:
         if ".pdf" not in file_path : # skip non pdf files
+            raise Exception("only pdf files are supported")
         # creating a pdf file object
         pdfFileObj = open(file_path, 'rb')
             txt = txt.replace("\t","") # strip tabs
             txt = re.sub(r" +"," ",txt) # strip extra space
             # 512 is related to the positional encoding "facebook/dpr-ctx_encoder-single-nq-base" model
+            file_name = file_path.split("/")[-1]
             if len(txt) < 512 :
                 new_data = {"title":f"{file_name}-page-{i}","text":txt}
                 df = df.append(new_data,ignore_index=True)
             else :
 def process_dataset(df):
     """processess the dataframe and returns a dataset variable"""
+    if len(df) == 0 :
+        raise Exception("empty pdf files, or can't read text from them")
     ds = Dataset.from_pandas(df)
     ds = ds.map(process)
     ds.add_faiss_index(column='embeddings') # add faiss index
 def search(query, ds, k=3):
     """searches the query in the dataset and returns the k most similar"""
+    try :
+        tokens = q_tokenizer(query, return_tensors="pt")
+        query_embed = q_encoder(**tokens)[0][0].numpy()
+        scores, retrieved_examples = ds.get_nearest_examples("embeddings", query_embed, k=k)
+        out = f"""title : {retrieved_examples["title"][0]},\ncontent: {retrieved_examples["text"][0]}
+        similar resources: {retrieved_examples["title"]}
+        """
+    except Exception as e:
+        out = f"error: {e}"
     return out
 def predict(query,file_paths, k=3):
     """predicts the most similar files to the query"""
+    try :
+        df = process_pdfs(file_paths)
+        ds = process_dataset(df)
+        out =  search(query,ds,k=k)
+    except Exception as e:
+        out = f"error: {e}"
+    return out
 with gr.Blocks() as demo :
     with gr.Column():