InkeyDevelopment committed on
Commit
c85bec4
·
verified ·
1 Parent(s): 98e278f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -67
app.py CHANGED
@@ -17,78 +17,13 @@ import json
17
  import os
18
  load_dotenv()
19
 
20
- def extract_text_images(pdf_path, output_dir="static/output_images"):
21
- doc = fitz.open(pdf_path)
22
- data = []
23
-
24
- if not os.path.exists(output_dir):
25
- os.makedirs(output_dir)
26
-
27
- for page_num in range(len(doc)):
28
- page = doc[page_num]
29
- text = page.get_text("text")
30
-
31
- images = page.get_images(full=True)
32
- image_paths = []
33
-
34
- for img_index, img in enumerate(images):
35
- xref = img[0]
36
- base_image = doc.extract_image(xref)
37
- image_bytes = base_image["image"]
38
- image_ext = base_image["ext"]
39
- image_filename = f"{output_dir}/page_{page_num+1}_img_{img_index+1}.{image_ext}"
40
-
41
- with open(image_filename, "wb") as img_file:
42
- img_file.write(image_bytes)
43
-
44
- image_paths.append(image_filename)
45
-
46
- data.append({"page": page_num + 1, "text": text, "images": image_paths})
47
-
48
- with open("pdf_data.json", "w") as f:
49
- json.dump(data, f, indent=4)
50
-
51
- return "Extraction completed!"
52
-
53
- pdf_path = "./Exelsys easyHR v10 User Guide.pdf"
54
- extract_text_images(pdf_path)
55
-
56
-
57
- # Load Hugging Face model
58
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
59
-
60
- def get_embedding(text):
61
- return model.encode(text, convert_to_numpy=True)
62
-
63
- def store_embeddings():
64
- with open("pdf_data.json") as f:
65
- data = json.load(f)
66
-
67
- dimension = 384
68
- index = faiss.IndexFlatL2(dimension)
69
- metadata = []
70
-
71
- for i, entry in enumerate(data):
72
- embedding = np.array(get_embedding(entry["text"])).astype("float32")
73
- index.add(np.array([embedding]))
74
- metadata.append({"page": entry["page"], "text": entry["text"], "images": entry["images"]})
75
-
76
- faiss.write_index(index, "faiss_index.bin")
77
-
78
- with open("metadata.json", "w") as f:
79
- json.dump(metadata, f, indent=4)
80
-
81
- return "Embeddings stored successfully!"
82
-
83
- store_embeddings()
84
-
85
 
86
 
87
  app = Flask(__name__)
88
 
89
  # Load Model and FAISS Index
90
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
91
- index = faiss.read_index("faiss_index.bin")
92
 
93
  groq_api_key = os.getenv('GROQ_API_KEY')
94
  model_name = "llama-3.3-70b-versatile"
@@ -99,7 +34,7 @@ llm = ChatGroq(
99
  model_name=model_name
100
  )
101
 
102
- with open("metadata.json") as f:
103
  metadata = json.load(f)
104
 
105
 
 
17
  import os
18
  load_dotenv()
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  app = Flask(__name__)
23
 
24
  # Load Model and FAISS Index
25
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
26
+ index = faiss.read_index("./faiss_index.bin")
27
 
28
  groq_api_key = os.getenv('GROQ_API_KEY')
29
  model_name = "llama-3.3-70b-versatile"
 
34
  model_name=model_name
35
  )
36
 
37
+ with open("./metadata.json") as f:
38
  metadata = json.load(f)
39
 
40