Chris4K committed (verified)
Commit 1449a38 · Parent(s): e267239

Update app.py

Files changed (1):
  app.py (+20 -16)
app.py CHANGED
@@ -68,8 +68,9 @@ class EmbeddingModel:
         self.model = HuggingFaceEmbeddings(model_name=model_name)
         self.max_tokens = max_tokens
 
-    def embed(self, text):
-        return self.model.embed_documents([text])
+    def embed(self, chunks: List[str]):
+        # Embed the list of chunks
+        return self.model.embed_documents(chunks)
 
 def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, max_tokens=None):
     # File processing
@@ -78,41 +79,44 @@ def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, max_tokens=None):
         file_path = os.path.join(FILES_DIR, file)
         text += FileHandler.extract_text(file_path)
 
-    # Split text
+    # Split text into chunks
     if split_strategy == 'token':
         splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
     else:
         splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
 
     chunks = splitter.split_text(text)
+
+    # Embed chunks, not the full text
     model = EmbeddingModel(MODELS[model_name], max_tokens=max_tokens)
-    embeddings = model.embed(text)
+    embeddings = model.embed(chunks)
 
     return embeddings, chunks
 
 def search_embeddings(query, model_name, top_k):
     model = HuggingFaceEmbeddings(model_name=MODELS[model_name])
     embeddings = model.embed_query(query)
-    return embeddings
+
+    # Perform FAISS or other similarity-based search over embeddings
+    # This part requires you to build and search a FAISS index with embeddings
+
+    return embeddings  # You would likely return the top-k results here
 
 def calculate_statistics(embeddings):
     # Return time taken, token count, etc.
     return {"tokens": len(embeddings), "time_taken": time.time()}
 
 import shutil
+
 def upload_file(file, model_name, split_strategy, chunk_size, overlap_size, max_tokens, query, top_k):
     # Ensure default values are set if None is passed
-    if chunk_size is None:
-        chunk_size = 100  # Set a default chunk size
-    if overlap_size is None:
-        overlap_size = 0  # Set a default overlap size
-
-    # Convert chunk_size and overlap_size to integers after checking for None
-    try:
-        chunk_size = int(chunk_size)  # Convert chunk_size to int
-        overlap_size = int(overlap_size)  # Convert overlap_size to int
-    except ValueError:
-        return {"error": "Chunk size and overlap size must be valid integers."}
+    chunk_size = int(chunk_size) if chunk_size else 100
+    overlap_size = int(overlap_size) if overlap_size else 0
+
+    # Save uploaded file
+    file_path = os.path.join(FILES_DIR, file.name)
+    with open(file_path, "wb") as f:
+        shutil.copyfileobj(file, f)  # Copy the uploaded file content to the destination
 
     # Process files and get embeddings
     embeddings, chunks = process_files(model_name, split_strategy, chunk_size, overlap_size, max_tokens)
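A note on the embed() signature change: LangChain's embed_documents takes a list of strings and returns one vector per string, so the embeddings that process_files now returns are index-aligned with the chunks list. A minimal standalone illustration of that behavior follows; the model name and import path are illustrative assumptions, since app.py's own imports are outside this diff.

from langchain_community.embeddings import HuggingFaceEmbeddings

chunks = ["first chunk of text", "second chunk of text"]
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectors = model.embed_documents(chunks)  # one embedding vector per chunk
assert len(vectors) == len(chunks)       # embeddings stay index-aligned with chunks
dim = len(vectors[0])                    # embedding dimension (384 for this model)

This alignment is what lets a downstream top-k search map matching vectors back to their chunk texts.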
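The comments added to search_embeddings describe, but do not implement, the similarity search: the function still returns only the query embedding. Below is a sketch of the FAISS index-and-search step those comments point at, assuming the faiss-cpu package and chunk embeddings produced as above; the helper name search_chunks and its wiring are assumptions for illustration, not part of this commit.

import faiss
import numpy as np

def search_chunks(chunk_embeddings, chunks, query_embedding, top_k):
    # Build a flat L2 index sized to the embedding dimension
    vectors = np.asarray(chunk_embeddings, dtype="float32")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # FAISS expects a 2D float32 array of query vectors
    query = np.asarray([query_embedding], dtype="float32")
    distances, indices = index.search(query, int(top_k))

    # Map the returned indices back to the original chunk texts
    return [(chunks[i], float(distances[0][rank])) for rank, i in enumerate(indices[0])]

With something like this in place, upload_file could pass the embeddings and chunks from process_files plus model.embed_query(query) into the helper and return the top-k chunk texts instead of the raw query vector.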