Tzetha committed on
Commit
79e6c24
Β·
1 Parent(s): 46f1db8

Added Files

Browse files
Files changed (3) hide show
  1. app.py +209 -0
  2. requirements.txt +10 -0
  3. setup.sh +2 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PIL import Image
3
+ from pytesseract import pytesseract
4
+ import PyPDF2
5
+ import enum
6
+ import os
7
+ import re
8
+ from collections import defaultdict
9
+ import folium
10
+ from streamlit_folium import st_folium
11
+ from geopy.geocoders import Nominatim
12
+ from geopy.exc import GeocoderTimedOut
13
+ import wikipedia
14
+ from transformers import pipeline
15
+ from openai import OpenAI
16
+
17
# NVIDIA OpenAI API Setup.
# SECURITY: never hard-code API credentials in source (the original committed a
# live-looking "nvapi-..." key). Read the key from the environment instead;
# set NVIDIA_API_KEY before launching the app.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY", ""),
)

# Load Named Entity Recognition (NER) model used by ImageReader.extract_key_details.
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

st.set_page_config(page_title="OCR & Historical Analysis", page_icon="📜", layout="wide")
27
+
28
+ # Custom Styling
29
def style_text(text):
    """Wrap *text* in a grey rounded HTML card for st.markdown(unsafe_allow_html=True)."""
    card_css = (
        "padding:10px;border-radius:10px;"
        "background-color:#e0e0e0;"
        "color:#333;"
        "font-weight:500;"
        "font-size:16px;"
    )
    return f"<div style='{card_css}'>{text}</div>"
39
+
40
def find_related_documents(query):
    """Search Wikipedia for *query* and return up to 5 article URLs.

    FIX: the original resolved every search hit inside one try block, so a
    single bad result (e.g. a DisambiguationError or PageError from
    wikipedia.page) discarded ALL links. Unresolvable results are now skipped
    individually; only a failure of the search itself returns an error entry.
    """
    try:
        search_results = wikipedia.search(query, results=5)
    except Exception as e:  # best-effort: surface the error as list content
        return [f"Error retrieving related documents: {str(e)}"]

    links = []
    for result in search_results:
        try:
            links.append(wikipedia.page(result).url)
        except Exception:
            # Skip disambiguation/missing pages instead of aborting the list.
            continue
    return links
47
+
48
def geocode_location(location):
    """Resolve a place name to a (latitude, longitude) tuple.

    Returns None when the geocoder finds nothing or times out (10 s limit).
    """
    geolocator = Nominatim(user_agent="streamlit_app")
    try:
        hit = geolocator.geocode(location, timeout=10)
    except GeocoderTimedOut:
        return None
    if hit is None:
        return None
    return (hit.latitude, hit.longitude)
55
+
56
def generate_historical_context_nvidia(text):
    """Use NVIDIA OpenAI API to generate a structured, summarized historical context.

    Two-pass flow against deepseek-r1:
      1. detailed analysis of *text* (events, figures, significance);
      2. compression of that analysis into at most 5 bullet points, after
         which any leading preamble paragraph is stripped with a regex.
    Returns the cleaned summary, or an error string on any API failure.
    """
    prompt_analysis = f"""
    Analyze the following text and provide a historical context. Identify:
    - Key historical events
    - Significant figures involved
    - The broader historical significance

    Text: {text}

    Provide a detailed response.
    """

    prompt_summary = """
    Summarize the historical context provided above in a concise and structured format:
    - Limit to 5 bullet points
    - Each bullet point should be under 100 words
    - Avoid unnecessary explanations or preamble—return only the summary
    """

    try:
        # Pass 1: detailed historical context.
        analysis = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are a historian providing detailed historical insights."},
                {"role": "user", "content": prompt_analysis},
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=4096,
            stream=False,
        )
        detailed_response = analysis.choices[0].message.content.strip()

        # Pass 2: summarize the context without the model's monologue.
        summary = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are an expert summarizer."},
                {"role": "user", "content": f"{detailed_response}\n\n{prompt_summary}"},
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=2048,
            stream=False,
        )
        summary_text = summary.choices[0].message.content.strip()

        # Drop everything up to the first blank line — removes AI-generated
        # preamble/reasoning ahead of the actual bullet list.
        cleaned = re.sub(r"^.*?\n\n", "", summary_text, flags=re.DOTALL)
        return cleaned if cleaned else "No historical context found."

    except Exception as e:
        return f"Error retrieving AI-generated historical context: {str(e)}"
115
+
116
+
117
class OS(enum.Enum):
    """Host operating system; decides how ImageReader locates the Tesseract binary."""
    Mac = 0  # Tesseract assumed discoverable without configuration
    Windows = 1  # ImageReader points pytesseract at the default install path
120
+
121
class Languages(enum.Enum):
    """Selectable OCR languages; values are Tesseract language codes passed to pytesseract."""
    English = "eng"
    Filipino = "fil"
    Spanish = "spa"
125
+
126
class ImageReader:
    """Text extraction from images (OCR) and PDFs, plus simple entity extraction."""

    def __init__(self, os):
        """On Windows, point pytesseract at the default Tesseract install path."""
        if os == OS.Windows:
            pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'

    def extract_text(self, image: Image, lang: Languages):
        """OCR *image* in the given language, collapsing all whitespace runs to single spaces."""
        raw = pytesseract.image_to_string(image, lang=lang.value)
        return ' '.join(raw.split())

    def extract_text_from_pdf(self, pdf_file, lang: Languages):
        """Concatenate the embedded text of every page in *pdf_file*.

        NOTE(review): *lang* is accepted for interface symmetry but unused —
        PyPDF2 reads embedded text, it does not OCR.
        """
        document = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() or "" for page in document.pages)

    def extract_key_details(self, text):
        """Return {'dates', 'names', 'locations'} sets extracted from *text*.

        Dates come from a regex (d/m/y variants or a bare 4-digit year);
        names and locations come from the module-level NER pipeline's
        PER/LOC entity labels.
        """
        date_pattern = r'\b(?:\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}|\d{4})\b'
        details = {
            "dates": set(re.findall(date_pattern, text)),
            "names": set(),
            "locations": set(),
        }
        for ent in nlp(text):
            label = ent['entity']
            if "PER" in label:
                details['names'].add(ent['word'])
            elif "LOC" in label:
                details['locations'].add(ent['word'])
        return details
153
+
154
+ # UI Layout
155
+ st.title("πŸ“œ OCR & Historical Context Analyzer")
156
+ st.markdown("Extract text from images and PDFs, analyze named entities, and retrieve historical context.")
157
+
158
+ col1, col2 = st.columns([1, 2])
159
+
160
+ with col1:
161
+ selected_os = st.selectbox("πŸ–₯️ Select your OS", [OS.Windows, OS.Mac], format_func=lambda x: x.name)
162
+ selected_lang = st.selectbox("🌍 Select language", list(Languages), format_func=lambda x: x.name)
163
+ uploaded_file = st.file_uploader("πŸ“‚ Upload an image or PDF", type=["png", "jpg", "jpeg", "pdf"])
164
+
165
+ if uploaded_file:
166
+ ir = ImageReader(selected_os)
167
+ extracted_text = ""
168
+ if uploaded_file.type in ["image/png", "image/jpeg"]:
169
+ image = Image.open(uploaded_file)
170
+ st.image(image, caption="Uploaded Image", use_column_width=True)
171
+ extracted_text = ir.extract_text(image, selected_lang)
172
+ else:
173
+ extracted_text = ir.extract_text_from_pdf(uploaded_file, selected_lang)
174
+
175
+ st.markdown("### πŸ“ Extracted Text:")
176
+ st.markdown(style_text(extracted_text), unsafe_allow_html=True)
177
+
178
+ key_details = ir.extract_key_details(extracted_text)
179
+ st.markdown("### πŸ” Extracted Key Details")
180
+ st.write(f"**πŸ“… Dates:** {', '.join(key_details['dates']) if key_details['dates'] else 'None found'}")
181
+ st.write(f"**πŸ‘€ Names:** {', '.join(key_details['names']) if key_details['names'] else 'None found'}")
182
+ st.write(f"**πŸ“ Locations:** {', '.join(key_details['locations']) if key_details['locations'] else 'None found'}")
183
+
184
+ combined_terms = ' '.join(key_details['dates'].union(key_details['locations']).union(key_details['names']))
185
+ historical_context = generate_historical_context_nvidia(combined_terms)
186
+ st.markdown("### πŸ›οΈ Historical Context")
187
+ st.markdown(style_text(historical_context), unsafe_allow_html=True)
188
+
189
+ st.markdown("### 🌐 Search the Web")
190
+ search_query = st.text_input("Enter a keyword or phrase:")
191
+ if search_query:
192
+ search_results = generate_historical_context_nvidia(search_query)
193
+ st.markdown(style_text(search_results), unsafe_allow_html=True)
194
+
195
+ related_docs = find_related_documents(combined_terms)
196
+ st.markdown("### πŸ“š Related Historical Documents")
197
+ for link in related_docs:
198
+ st.markdown(f"[πŸ”— {link}]({link})")
199
+
200
+ st.markdown("### πŸ—ΊοΈ Map of Key Locations")
201
+ map_center = [10.0, 10.0]
202
+ map_obj = folium.Map(location=map_center, zoom_start=2)
203
+
204
+ for loc in key_details['locations']:
205
+ coords = geocode_location(loc)
206
+ if coords:
207
+ folium.Marker(coords, popup=loc).add_to(map_obj)
208
+
209
+ st_folium(map_obj, width=700, height=500)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pillow
3
+ pytesseract
4
+ pypdf2
5
+ transformers
6
+ openai
7
+ wikipedia
8
+ geopy
9
+ folium
10
+ streamlit-folium
setup.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
#!/bin/bash
# Install the Tesseract OCR engine and headers required by pytesseract.
# NOTE(review): the app offers Filipino ("fil") and Spanish ("spa") OCR;
# those need extra traineddata packages (e.g. tesseract-ocr-spa) — confirm
# the deployment image provides them, or only English OCR will work.
apt-get update && apt-get install -y tesseract-ocr libtesseract-dev