Muzammil6376 committed on
Commit d179e57 · verified
1 Parent(s): 0a76168

Update app.py

Files changed (1)
  1. app.py +107 -112
app.py CHANGED
@@ -2,175 +2,170 @@ import os
 import shutil
 from typing import List
 
-import torch
 import gradio as gr
 from PIL import Image
 
-# Unstructured for PDF parsing
+# PDF parsing
+from pypdf import PdfReader
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.utils.constants import PartitionStrategy
 
-# Vision-language captioning (BLIP)
-from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel
-
-# Hugging Face Inference client for LLM
-from huggingface_hub import InferenceClient
+# Text splitting
+from langchain.text_splitter import CharacterTextSplitter
 
-# FAISS vectorstore
+# Vectorstore and embeddings
 from langchain_community.vectorstores import FAISS
-
-# Text embeddings
 from langchain_huggingface import HuggingFaceEmbeddings
 
+# Vision-language captioning (BLIP)
+from transformers import BlipProcessor, BlipForConditionalGeneration
+
+# LLM via HF Inference API
+from huggingface_hub import InferenceClient
+
 # ── Globals ───────────────────────────────────────────────────────────────────
 retriever = None
-current_pdf_name = None
-combined_texts: List[str] = []  # text chunks + captions
-combined_vectors: List[List[float]] = []
 pdf_text: str = ""
 
-# ── Setup ─────────────────────────────────────────────────────────────────────
+# ── Setup directories ──────────────────────────────────────────────────────────
 FIGURES_DIR = "figures"
 if os.path.exists(FIGURES_DIR):
     shutil.rmtree(FIGURES_DIR)
-else:
-    os.makedirs(FIGURES_DIR, exist_ok=True)
+os.makedirs(FIGURES_DIR, exist_ok=True)
 
-# ── Clients & Models ───────────────────────────────────────────────────────────
-hf = InferenceClient()  # for chat completions
-txt_emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# ── Models & Clients ───────────────────────────────────────────────────────────
+hf_client = InferenceClient()  # uses HUGGINGFACEHUB_API_TOKEN
+
+# Embeddings model (local lightweight SBERT)
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
+# BLIP for image captioning
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-
+# ── Helper functions ───────────────────────────────────────────────────────────
 
 def generate_caption(image_path: str) -> str:
     image = Image.open(image_path).convert("RGB")
     inputs = blip_processor(image, return_tensors="pt")
-    out = blip_model.generate(**inputs)
-    return blip_processor.decode(out[0], skip_special_tokens=True)
-
-
-def embed_texts(texts: List[str]) -> List[List[float]]:
-    return txt_emb.embed_documents(texts)
-
-
-def embed_images(image_paths: List[str]) -> List[List[float]]:
-    feats = []
-    for p in image_paths:
-        img = Image.open(p).convert("RGB")
-        inputs = clip_processor(images=img, return_tensors="pt")
-        with torch.no_grad():
-            v = clip_model.get_image_features(**inputs)
-        feats.append(v[0].cpu().tolist())
-    return feats
+    outputs = blip_model.generate(**inputs)
+    return blip_processor.decode(outputs[0], skip_special_tokens=True)
 
 
 def process_pdf(pdf_file):
-    global retriever, current_pdf_name, combined_texts, combined_vectors, pdf_text
+    global retriever, pdf_text
     if pdf_file is None:
-        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
+        return None, "❌ Please upload a PDF.", gr.update(interactive=False)
 
-    current_pdf_name = os.path.basename(pdf_file.name)
-    # extract full text
-    from pypdf import PdfReader
+    # read full text
     reader = PdfReader(pdf_file.name)
-    pages = [page.extract_text() or "" for page in reader.pages]
+    pages = [p.extract_text() or "" for p in reader.pages]
     pdf_text = "\n\n".join(pages)
 
-    # rich parsing for images
+    # extract elements with images via unstructured
     try:
-        els = partition_pdf(
+        elements = partition_pdf(
             filename=pdf_file.name,
             strategy=PartitionStrategy.HI_RES,
-            extract_image_block_types=["Image","Table"],
+            extract_image_block_types=["Image", "Table"],
             extract_image_block_output_dir=FIGURES_DIR,
         )
-        texts = [e.text for e in els if e.category not in ["Image","Table"] and e.text]
-        imgs = [os.path.join(FIGURES_DIR,f) for f in os.listdir(FIGURES_DIR)
-                if f.lower().endswith((".png",".jpg",".jpeg"))]
+        text_elems = [e.text for e in elements if e.category not in ["Image","Table"] and e.text]
+        image_files = [os.path.join(FIGURES_DIR, f) for f in os.listdir(FIGURES_DIR)
+                       if f.lower().endswith((".png",".jpg",".jpeg"))]
     except:
-        texts = pages
-        imgs = []
+        text_elems = pages
+        image_files = []
+
+    # generate captions
+    captions = [generate_caption(img) for img in image_files]
 
-    # split text chunks
-    from langchain.text_splitter import CharacterTextSplitter
+    # split text into chunks
     splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     chunks = []
-    for t in texts:
+    for t in text_elems:
         chunks.extend(splitter.split_text(t))
-    caps = [generate_caption(i) for i in imgs]
-
-    # embed
-    tvecs = embed_texts(chunks + caps)
-    ivecs = embed_images(imgs)
-    # align dims: captions embedded twice? simple: drop caps embeddings from tvecs
-    text_count = len(chunks)
-    cap_count = len(caps)
-    # use text embeddings for text and clip for images
-    combined_texts = chunks + caps
-    combined_vectors = tvecs[:text_count] + ivecs
-
-    # Build FAISS index from precomputed multimodal vectors:
-    index = FAISS.from_embeddings(
-        combined_vectors,
-        combined_texts,
-    )
+
+    # combine text chunks and image captions
+    docs = chunks + captions
+
+    # embed and index
+    vectors = embeddings.embed_documents(docs)
+    index = FAISS.from_embeddings(vectors, docs)
     retriever = index.as_retriever(search_kwargs={"k":2})
-    status = f"✅ Indexed '{current_pdf_name}' — {len(chunks)} text chunks + {len(imgs)} images"
-    return current_pdf_name, status, gr.update(interactive=True)
+
+    status = f"✅ Indexed — {len(chunks)} text chunks + {len(captions)} captions"
+    return os.path.basename(pdf_file.name), status, gr.update(interactive=True)
 
 
-def ask_question(pdf_name,question):
-    global retriever
+def ask_question(pdf_name, question):
     if retriever is None:
-        return "❌ Please process a PDF first."
-    if not question.strip():
-        return "❌ Enter a question."
+        return "❌ Please upload + index a PDF first."
+    if not question:
+        return "❌ Please ask something."
+
     docs = retriever.get_relevant_documents(question)
-    ctx = "\n\n".join(d.page_content for d in docs)
-    prompt = f"Use contexts:\n{ctx}\nQuestion:{question}\nAnswer:"
-    res = hf.chat_completion(model="google/gemma-3-27b-it",messages=[{"role":"user","content":prompt}],max_tokens=128)
+    context = "\n\n".join(d.page_content for d in docs)
+    prompt = f"Use the following excerpts to answer:\n{context}\nQuestion: {question}\nAnswer:"
+
+    res = hf_client.chat_completion(
+        model="google/gemma-3-27b-it",
+        messages=[{"role":"user","content":prompt}],
+        max_tokens=128,
+        temperature=0.5,
+    )
     return res["choices"][0]["message"]["content"].strip()
 
 
-def generate_summary(): return ask_question(None,"Summarize:\n"+pdf_text[:2000])
+def generate_summary():
+    if not pdf_text:
+        return "❌ Please index a PDF first."
+    return ask_question(None, f"Summarize concisely:\n{pdf_text[:2000]}")
 
-def extract_keywords(): return ask_question(None,"Extract keywords:\n"+pdf_text[:2000])
+
+def extract_keywords():
+    if not pdf_text:
+        return "❌ Please index first."
+    return ask_question(None, f"Extract 10–15 key terms:\n{pdf_text[:2000]}")
 
-def clear_interface():
-    global retriever,combined_texts,combined_vectors,pdf_text
-    retriever=None
-    combined_texts=[]
-    combined_vectors=[]
-    pdf_text=""
-    shutil.rmtree(FIGURES_DIR,ignore_errors=True)
-    os.makedirs(FIGURES_DIR,exist_ok=True)
+
+def clear_all():
+    global retriever, pdf_text
+    retriever = None
+    pdf_text = ""
+    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
+    os.makedirs(FIGURES_DIR, exist_ok=True)
     return None, "", gr.update(interactive=False)
 
-# UI
-theme=gr.themes.Soft(primary_hue="indigo",secondary_hue="blue")
+# ── Gradio UI ────────────────────────────────────────────────────────────────
+theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
 with gr.Blocks(theme=theme) as demo:
-    gr.Markdown("# DocQueryAI (True Multimodal RAG)")
+    gr.Markdown("# Multimodal RAG with HF & LangChain")
     with gr.Row():
         with gr.Column():
-            pdf_disp=gr.Textbox(label="Active Document",interactive=False)
-            pdf_file=gr.File(file_types=[".pdf"],type="filepath")
-            btn_process=gr.Button("Process PDF")
-            status=gr.Textbox(interactive=False)
+            pdf_disp = gr.Textbox(label="Active PDF", interactive=False)
+            pdf_file = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
+            btn_proc = gr.Button("📄 Process PDF")
+            status = gr.Textbox(label="Status", interactive=False)
+
         with gr.Column():
-            q_in=gr.Textbox(lines=3,interactive=False)
-            btn_ask=gr.Button("Ask")
-            ans=gr.Textbox(interactive=False)
-    btn_sum=gr.Button("Summary",interactive=False);out_sum=gr.Textbox(interactive=False)
-    btn_key=gr.Button("Keywords",interactive=False);out_key=gr.Textbox(interactive=False)
-    btn_clear=gr.Button("Clear All")
-    btn_process.click(process_pdf,[pdf_file],[pdf_disp,status,q_in])
-    btn_ask.click(ask_question,[pdf_disp,q_in],ans)
-    btn_sum.click(generate_summary,[],out_sum)
-    btn_key.click(extract_keywords,[],out_key)
-    btn_clear.click(clear_interface,[],[pdf_disp,status,q_in])
-if __name__=="__main__": demo.launch()
+            q_in = gr.Textbox(label="Your question", interactive=False)
+            btn_ask = gr.Button("❓ Ask", interactive=False)
+            ans = gr.Textbox(label="Answer", interactive=False)
+
+    with gr.Row():
+        btn_sum = gr.Button("📋 Summary", interactive=False)
+        sum_out = gr.Textbox(interactive=False)
+        btn_key = gr.Button("🏷️ Keywords", interactive=False)
+        key_out = gr.Textbox(interactive=False)
+
+    btn_clear = gr.Button("🗑️ Clear All")
+
+    btn_proc.click(process_pdf, [pdf_file], [pdf_disp, status, q_in])
+    btn_ask.click(ask_question, [pdf_disp, q_in], ans)
+    btn_sum.click(generate_summary, [], sum_out)
+    btn_key.click(extract_keywords, [], key_out)
+    btn_clear.click(clear_all, [], [pdf_disp, status, q_in])
+
+if __name__ == "__main__":
+    demo.launch(debug=True)
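
A note for anyone reusing the indexing step outside this Space: in current langchain_community releases, FAISS.from_embeddings takes an iterable of (text, embedding) pairs plus an Embeddings object, rather than two parallel lists, so the vectors and documents may need to be zipped together. A minimal standalone sketch under that assumption (the sample docs list is hypothetical, not taken from the commit):

# Sketch only: build a FAISS index from precomputed vectors and query it.
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

docs = ["a text chunk", "another chunk", "caption of an extracted figure"]  # hypothetical inputs
vectors = embeddings.embed_documents(docs)

# from_embeddings expects (text, vector) pairs plus an Embeddings object for query-time encoding
index = FAISS.from_embeddings(list(zip(docs, vectors)), embeddings)
retriever = index.as_retriever(search_kwargs={"k": 2})
print(retriever.get_relevant_documents("figure"))

Passing the same Embeddings object that produced the stored vectors keeps query-time encoding consistent with the indexed documents.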