Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,7 @@ from flask import jsonify
|
|
15 |
load_dotenv()
|
16 |
|
17 |
API_URL_EMBEDDINGS = f"https://e4e5-196-96-202-255.ngrok-free.app/embeddings"
|
|
|
18 |
|
19 |
# FAISS index setup
|
20 |
DIM = 768 # Adjust based on the embedding model
|
@@ -72,6 +73,10 @@ def store_document_data(PDF_FILE):
|
|
72 |
def retrieve_document(query):
|
73 |
print(f"Retrieving document based on:\n{query}")
|
74 |
|
|
|
|
|
|
|
|
|
75 |
# Generate query embedding
|
76 |
query_embedding = embedding_model.encode([query]).astype(np.float32)
|
77 |
|
@@ -95,7 +100,7 @@ def clean_text(text):
|
|
95 |
print("cleaning")
|
96 |
text = unicodedata.normalize("NFKC", text) # Normalize Unicode characters
|
97 |
text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces and newlines
|
98 |
-
text = re.sub(r'[^a-zA-Z0-9
|
99 |
text = re.sub(r'(?i)(page\s*\d+)', '', text) # Remove page numbers
|
100 |
return text
|
101 |
|
|
|
15 |
load_dotenv()
|
16 |
|
17 |
API_URL_EMBEDDINGS = f"https://e4e5-196-96-202-255.ngrok-free.app/embeddings"
|
18 |
+
API_URL_METADATA = f"https://e4e5-196-96-202-255.ngrok-free.app/metadata"
|
19 |
|
20 |
# FAISS index setup
|
21 |
DIM = 768 # Adjust based on the embedding model
|
|
|
73 |
def retrieve_document(query):
|
74 |
print(f"Retrieving document based on:\n{query}")
|
75 |
|
76 |
+
embeddings_file = response.get(API_URL_EMBEDDINGS)
|
77 |
+
metadata_file = response.get(API_URL_METADATA)
|
78 |
+
|
79 |
+
print(embeddings_file, metadata_file)
|
80 |
# Generate query embedding
|
81 |
query_embedding = embedding_model.encode([query]).astype(np.float32)
|
82 |
|
|
|
100 |
print("cleaning")
|
101 |
text = unicodedata.normalize("NFKC", text) # Normalize Unicode characters
|
102 |
text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces and newlines
|
103 |
+
text = re.sub(r'[^a-zA-Z0-9.,!?;:\\"()\-]', ' ', text) # Keep basic punctuation
|
104 |
text = re.sub(r'(?i)(page\s*\d+)', '', text) # Remove page numbers
|
105 |
return text
|
106 |
|