Svngoku committed on
Commit
e2c6744
·
verified ·
1 Parent(s): 96d9245
Files changed (1) hide show
  1. app.py +10 -12
app.py CHANGED
@@ -75,26 +75,22 @@ class OCRProcessor:
75
 
76
  def _get_file_content(self, file_input: Union[str, object]) -> bytes:
77
  try:
78
- if isinstance(file_input, str): # File path
 
 
 
 
 
79
  with open(file_input, "rb") as f:
80
  return f.read()
81
  elif hasattr(file_input, 'read'): # File-like object
82
  return file_input.read()
83
  else:
84
- raise ValueError("Invalid file input: must be a path or file-like object")
85
  except Exception as e:
86
  logger.error(f"Error getting file content: {str(e)}")
87
  raise
88
 
89
- def _fetch_url_content(self, url: str) -> bytes:
90
- try:
91
- response = requests.get(url, timeout=10)
92
- response.raise_for_status()
93
- return response.content
94
- except requests.RequestException as e:
95
- logger.error(f"Error fetching URL {url}: {str(e)}")
96
- raise
97
-
98
  def ocr_pdf_url(self, pdf_url: str) -> str:
99
  logger.info(f"Processing PDF URL: {pdf_url}")
100
  try:
@@ -182,6 +178,8 @@ class OCRProcessor:
182
  content = chat_response.choices[0].message.content if chat_response.choices else "{}"
183
  try:
184
  response_dict = json.loads(content)
 
 
185
  except json.JSONDecodeError:
186
  logger.error("Invalid JSON response from chat API")
187
  response_dict = {}
@@ -204,7 +202,7 @@ class OCRProcessor:
204
  @staticmethod
205
  def _format_structured_response(file_path: str, content: Dict) -> str:
206
  languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
207
- valid_langs = [l for l in content.get("languages", [DEFAULT_LANGUAGE]) if l in languages.values()]
208
 
209
  response = {
210
  "file_name": Path(file_path).name,
 
75
 
76
  def _get_file_content(self, file_input: Union[str, object]) -> bytes:
77
  try:
78
+ if isinstance(file_input, str) and file_input.startswith(("http://", "https://")):
79
+ # Handle URLs
80
+ response = requests.get(file_input, timeout=10)
81
+ response.raise_for_status()
82
+ return response.content
83
+ elif isinstance(file_input, str): # File path
84
  with open(file_input, "rb") as f:
85
  return f.read()
86
  elif hasattr(file_input, 'read'): # File-like object
87
  return file_input.read()
88
  else:
89
+ raise ValueError("Invalid file input: must be a URL, path, or file-like object")
90
  except Exception as e:
91
  logger.error(f"Error getting file content: {str(e)}")
92
  raise
93
 
 
 
 
 
 
 
 
 
 
94
  def ocr_pdf_url(self, pdf_url: str) -> str:
95
  logger.info(f"Processing PDF URL: {pdf_url}")
96
  try:
 
178
  content = chat_response.choices[0].message.content if chat_response.choices else "{}"
179
  try:
180
  response_dict = json.loads(content)
181
+ if isinstance(response_dict, list): # Handle unexpected list response
182
+ response_dict = response_dict[0] if response_dict else {}
183
  except json.JSONDecodeError:
184
  logger.error("Invalid JSON response from chat API")
185
  response_dict = {}
 
202
  @staticmethod
203
  def _format_structured_response(file_path: str, content: Dict) -> str:
204
  languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
205
+ valid_langs = [l for l in (content.get("languages") or [DEFAULT_LANGUAGE]) if l in languages.values()]
206
 
207
  response = {
208
  "file_name": Path(file_path).name,