Spaces:
Running
Running
:'(
Browse files
app.py
CHANGED
@@ -75,26 +75,22 @@ class OCRProcessor:
|
|
75 |
|
76 |
def _get_file_content(self, file_input: Union[str, object]) -> bytes:
|
77 |
try:
|
78 |
-
if isinstance(file_input, str)
|
|
|
|
|
|
|
|
|
|
|
79 |
with open(file_input, "rb") as f:
|
80 |
return f.read()
|
81 |
elif hasattr(file_input, 'read'): # File-like object
|
82 |
return file_input.read()
|
83 |
else:
|
84 |
-
raise ValueError("Invalid file input: must be a path or file-like object")
|
85 |
except Exception as e:
|
86 |
logger.error(f"Error getting file content: {str(e)}")
|
87 |
raise
|
88 |
|
89 |
-
def _fetch_url_content(self, url: str) -> bytes:
|
90 |
-
try:
|
91 |
-
response = requests.get(url, timeout=10)
|
92 |
-
response.raise_for_status()
|
93 |
-
return response.content
|
94 |
-
except requests.RequestException as e:
|
95 |
-
logger.error(f"Error fetching URL {url}: {str(e)}")
|
96 |
-
raise
|
97 |
-
|
98 |
def ocr_pdf_url(self, pdf_url: str) -> str:
|
99 |
logger.info(f"Processing PDF URL: {pdf_url}")
|
100 |
try:
|
@@ -182,6 +178,8 @@ class OCRProcessor:
|
|
182 |
content = chat_response.choices[0].message.content if chat_response.choices else "{}"
|
183 |
try:
|
184 |
response_dict = json.loads(content)
|
|
|
|
|
185 |
except json.JSONDecodeError:
|
186 |
logger.error("Invalid JSON response from chat API")
|
187 |
response_dict = {}
|
@@ -204,7 +202,7 @@ class OCRProcessor:
|
|
204 |
@staticmethod
|
205 |
def _format_structured_response(file_path: str, content: Dict) -> str:
|
206 |
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
207 |
-
valid_langs = [l for l in content.get("languages"
|
208 |
|
209 |
response = {
|
210 |
"file_name": Path(file_path).name,
|
|
|
75 |
|
76 |
def _get_file_content(self, file_input: Union[str, object]) -> bytes:
|
77 |
try:
|
78 |
+
if isinstance(file_input, str) and file_input.startswith(("http://", "https://")):
|
79 |
+
# Handle URLs
|
80 |
+
response = requests.get(file_input, timeout=10)
|
81 |
+
response.raise_for_status()
|
82 |
+
return response.content
|
83 |
+
elif isinstance(file_input, str): # File path
|
84 |
with open(file_input, "rb") as f:
|
85 |
return f.read()
|
86 |
elif hasattr(file_input, 'read'): # File-like object
|
87 |
return file_input.read()
|
88 |
else:
|
89 |
+
raise ValueError("Invalid file input: must be a URL, path, or file-like object")
|
90 |
except Exception as e:
|
91 |
logger.error(f"Error getting file content: {str(e)}")
|
92 |
raise
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
def ocr_pdf_url(self, pdf_url: str) -> str:
|
95 |
logger.info(f"Processing PDF URL: {pdf_url}")
|
96 |
try:
|
|
|
178 |
content = chat_response.choices[0].message.content if chat_response.choices else "{}"
|
179 |
try:
|
180 |
response_dict = json.loads(content)
|
181 |
+
if isinstance(response_dict, list): # Handle unexpected list response
|
182 |
+
response_dict = response_dict[0] if response_dict else {}
|
183 |
except json.JSONDecodeError:
|
184 |
logger.error("Invalid JSON response from chat API")
|
185 |
response_dict = {}
|
|
|
202 |
@staticmethod
|
203 |
def _format_structured_response(file_path: str, content: Dict) -> str:
|
204 |
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
205 |
+
valid_langs = [l for l in (content.get("languages") or [DEFAULT_LANGUAGE]) if l in languages.values()]
|
206 |
|
207 |
response = {
|
208 |
"file_name": Path(file_path).name,
|