Update app.py
Browse files
app.py
CHANGED
@@ -68,8 +68,9 @@ class EmbeddingModel:
|
|
68 |
self.model = HuggingFaceEmbeddings(model_name=model_name)
|
69 |
self.max_tokens = max_tokens
|
70 |
|
71 |
-
def embed(self,
|
72 |
-
|
|
|
73 |
|
74 |
def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, max_tokens=None):
|
75 |
# File processing
|
@@ -78,41 +79,44 @@ def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, m
|
|
78 |
file_path = os.path.join(FILES_DIR, file)
|
79 |
text += FileHandler.extract_text(file_path)
|
80 |
|
81 |
-
# Split text
|
82 |
if split_strategy == 'token':
|
83 |
splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
84 |
else:
|
85 |
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
86 |
|
87 |
chunks = splitter.split_text(text)
|
|
|
|
|
88 |
model = EmbeddingModel(MODELS[model_name], max_tokens=max_tokens)
|
89 |
-
embeddings = model.embed(
|
90 |
|
91 |
return embeddings, chunks
|
92 |
|
93 |
def search_embeddings(query, model_name, top_k):
|
94 |
model = HuggingFaceEmbeddings(model_name=MODELS[model_name])
|
95 |
embeddings = model.embed_query(query)
|
96 |
-
|
|
|
|
|
|
|
|
|
97 |
|
98 |
def calculate_statistics(embeddings, start_time=None):
    """Summarise an embedding run.

    Args:
        embeddings: Sized collection of embeddings. ``None`` is treated as
            an empty collection instead of raising ``TypeError``.
        start_time: Optional ``time.time()`` value captured before the work
            started. When given, ``"time_taken"`` is the elapsed seconds
            since that moment.

    Returns:
        A dict with two keys:
        ``"tokens"`` -- ``len(embeddings)``, i.e. the number of items in
        ``embeddings`` (not a tokenizer-level token count).
        ``"time_taken"`` -- elapsed seconds when ``start_time`` is given;
        otherwise the current ``time.time()`` timestamp, which is what this
        function historically returned and is kept for existing callers.
    """
    now = time.time()
    count = len(embeddings) if embeddings is not None else 0
    if start_time is None:
        # Legacy behaviour: without a start time no duration can be
        # computed, so the raw timestamp is reported as before.
        time_taken = now
    else:
        # Clamp at zero so a start time in the future (clock adjustment)
        # never yields a negative duration.
        time_taken = max(0.0, now - start_time)
    return {"tokens": count, "time_taken": time_taken}
|
101 |
|
102 |
import shutil
|
|
|
103 |
def upload_file(file, model_name, split_strategy, chunk_size, overlap_size, max_tokens, query, top_k):
|
104 |
# Ensure default values are set if None is passed
|
105 |
-
if chunk_size
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
chunk_size = int(chunk_size) # Convert chunk_size to int
|
113 |
-
overlap_size = int(overlap_size) # Convert overlap_size to int
|
114 |
-
except ValueError:
|
115 |
-
return {"error": "Chunk size and overlap size must be valid integers."}
|
116 |
|
117 |
# Process files and get embeddings
|
118 |
embeddings, chunks = process_files(model_name, split_strategy, chunk_size, overlap_size, max_tokens)
|
|
|
68 |
self.model = HuggingFaceEmbeddings(model_name=model_name)
|
69 |
self.max_tokens = max_tokens
|
70 |
|
71 |
+
def embed(self, chunks: List[str]) -> List[List[float]]:
    """Embed a list of text chunks with the wrapped embedding model.

    Args:
        chunks: Text chunks to embed.

    Returns:
        The result of ``self.model.embed_documents(chunks)`` (expected to
        be one vector per chunk), or an empty list when ``chunks`` is
        empty or ``None``.
    """
    # Nothing to embed: return early instead of invoking the model on an
    # empty batch.
    if not chunks:
        return []
    return self.model.embed_documents(chunks)
|
74 |
|
75 |
def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, max_tokens=None):
|
76 |
# File processing
|
|
|
79 |
file_path = os.path.join(FILES_DIR, file)
|
80 |
text += FileHandler.extract_text(file_path)
|
81 |
|
82 |
+
# Split text into chunks
|
83 |
if split_strategy == 'token':
|
84 |
splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
85 |
else:
|
86 |
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
87 |
|
88 |
chunks = splitter.split_text(text)
|
89 |
+
|
90 |
+
# Embed chunks, not the full text
|
91 |
model = EmbeddingModel(MODELS[model_name], max_tokens=max_tokens)
|
92 |
+
embeddings = model.embed(chunks)
|
93 |
|
94 |
return embeddings, chunks
|
95 |
|
96 |
def search_embeddings(query, model_name, top_k):
    """Embed a search query with the model registered under ``model_name``.

    Similarity search is not implemented here yet: ``top_k`` is currently
    unused and the query embedding itself is returned.

    Args:
        query: Query text passed to ``embed_query``.
        model_name: Key into ``MODELS`` selecting the embedding model.
        top_k: Intended number of results to return; not used yet.

    Returns:
        The value returned by ``embed_query`` for ``query``.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODELS[model_name])

    # TODO: build and search a FAISS (or other similarity) index over the
    # stored embeddings and return the top-k matches instead of the raw
    # query embedding.
    return embedder.embed_query(query)
|
104 |
|
105 |
def calculate_statistics(embeddings, start_time=None):
    """Summarise an embedding run.

    Args:
        embeddings: Sized collection of embeddings. ``None`` is treated as
            an empty collection instead of raising ``TypeError``.
        start_time: Optional ``time.time()`` value captured before the work
            started. When given, ``"time_taken"`` is the elapsed seconds
            since that moment.

    Returns:
        A dict with two keys:
        ``"tokens"`` -- ``len(embeddings)``, i.e. the number of items in
        ``embeddings`` (not a tokenizer-level token count).
        ``"time_taken"`` -- elapsed seconds when ``start_time`` is given;
        otherwise the current ``time.time()`` timestamp, which is what this
        function historically returned and is kept for existing callers.
    """
    now = time.time()
    count = len(embeddings) if embeddings is not None else 0
    if start_time is None:
        # Legacy behaviour: without a start time no duration can be
        # computed, so the raw timestamp is reported as before.
        time_taken = now
    else:
        # Clamp at zero so a start time in the future (clock adjustment)
        # never yields a negative duration.
        time_taken = max(0.0, now - start_time)
    return {"tokens": count, "time_taken": time_taken}
|
108 |
|
109 |
import shutil
|
110 |
+
|
111 |
def upload_file(file, model_name, split_strategy, chunk_size, overlap_size, max_tokens, query, top_k):
|
112 |
# Ensure default values are set if None is passed
|
113 |
+
chunk_size = int(chunk_size) if chunk_size else 100
|
114 |
+
overlap_size = int(overlap_size) if overlap_size else 0
|
115 |
+
|
116 |
+
# Save uploaded file
|
117 |
+
file_path = os.path.join(FILES_DIR, file.name)
|
118 |
+
with open(file_path, "wb") as f:
|
119 |
+
shutil.copyfileobj(file, f) # Copy the uploaded file content to the destination
|
|
|
|
|
|
|
|
|
120 |
|
121 |
# Process files and get embeddings
|
122 |
embeddings, chunks = process_files(model_name, split_strategy, chunk_size, overlap_size, max_tokens)
|