Spaces:

ahmedkasem
/

quran-nlp

Sleeping

App Files Files Community

deveix committed on Apr 13, 2024

Commit

886c1e1

1 Parent(s): 5b25c6e

fix search

Browse files

Files changed (1) hide show

app/main.py +27 -12

app/main.py CHANGED Viewed

@@ -42,23 +42,38 @@ app.add_middleware(
 def index_file(filepath):
-    """ Index each line in a file for quick search. Returns a dictionary with key as content and value as line number. """
     index = {}
     with open(filepath, 'r', encoding='utf-8') as file:
-        for line_number, line in enumerate(file, 1):  # Starting line numbers at 1 for human readability
-            index[line.strip()] = line_number
     return index
-def get_text_by_line_number(filepath, line_numbers):
-    """ Retrieve specific lines from a file based on line numbers. """
-    lines = {}
     with open(filepath, 'r', encoding='utf-8') as file:
-        for line_number, line in enumerate(file, 1):
-            if line_number in line_numbers:
-                lines[line_number] = line.strip()
-                if len(lines) == len(line_numbers):  # Stop reading once all required lines are read
                     break
-    return lines
 # Existing API endpoints
@@ -105,7 +120,7 @@ async def get_answer(item: Item, token: str = Depends(verify_token)):
         line_numbers = [answers_index[answer] for answer in clean_answers if answer in answers_index]
         # Assuming 'retrieve_file.txt' is where we retrieve lines based on line numbers
-        result_text = get_text_by_line_number('app/quran_tafseer.txt', line_numbers)
         return {"result_text": result_text}
     except Exception as e:

 def index_file(filepath):
+    """ Index each block in a file separated by double newlines for quick search.
+    Returns a dictionary with key as content and value as block number. """
     index = {}
     with open(filepath, 'r', encoding='utf-8') as file:
+        content = file.read()  # Read the whole file at once
+        blocks = content.split("\n\n")  # Split the content by double newlines
+        for block_number, block in enumerate(blocks, 1):  # Starting block numbers at 1 for human readability
+            # Replace single newlines within blocks with space and strip leading/trailing whitespace
+            formatted_block = ' '.join(block.split('\n')).strip()
+            index[formatted_block] = block_number
+            # if(block_number == 100):
+            #     print(formatted_block)  # Print the 5th block
     return index
def get_text_by_block_number(filepath, block_numbers):
    """Retrieve the blocks of a file that carry the given block numbers.

    Blocks are the pieces obtained by splitting the file on double newlines,
    numbered from 1. Inside each returned block, single newlines are replaced
    with spaces and leading/trailing whitespace is stripped.

    Args:
        filepath: Path of the UTF-8 text file to read.
        block_numbers: Collection of 1-based block numbers to fetch.

    Returns:
        A list of normalised block texts in file order (ascending block
        number), not in the order the numbers were requested. Each block
        appears at most once even if its number is repeated, and numbers
        that match no block are ignored.
    """
    # A set gives O(1) membership per block; testing against the caller's
    # list made the scan O(blocks * requested).
    wanted = set(block_numbers)

    with open(filepath, 'r', encoding='utf-8') as file:
        blocks = file.read().split("\n\n")

    return [
        block.replace('\n', ' ').strip()
        for block_number, block in enumerate(blocks, 1)
        if block_number in wanted
    ]
 # Existing API endpoints
         line_numbers = [answers_index[answer] for answer in clean_answers if answer in answers_index]
         # Assuming 'retrieve_file.txt' is where we retrieve lines based on line numbers
+        result_text = get_text_by_block_number('app/quran_tafseer.txt', line_numbers)
         return {"result_text": result_text}
     except Exception as e: