Jimmy0866 committed on
Commit
71fadee
·
verified ·
1 Parent(s): 3bcb0b8

support OCR

Files changed (2) hide show
  1. README.md +29 -40
  2. pdf2text.py +29 -342
README.md CHANGED
@@ -1,62 +1,51 @@
1
- ---
2
- title: DocSummarizer_Jimmy
3
- emoji: 📝
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: "4.16.0"
8
- app_file: app.py
9
- pinned: true
10
- ---
11
 
12
- # DocSummarizer_Jimmy
13
 
14
- 這是一個簡易的文件摘要工具,支援以下功能:
15
 
16
- - 輸入純文字進行摘要
17
- - 📄 上傳 PDF,自動擷取文字並產生摘要
18
- - 🧠 使用 `pszemraj/bart-large-summary-map-reduce` 模型
19
- - 📁 支援範例檔案(放在 `examples/` 資料夾)
20
- - 🚀 Gradio 網頁介面即時輸出摘要結果
21
 
22
  ---
23
 
24
- ## 🔧 使用方式
25
-
26
- ### 本地端執行(建議使用 Python 3.10+)
27
 
 
28
  ```bash
29
- pip install -r requirements.txt
30
  python app.py
31
  ```
32
 
33
- ### 📁 範例檔案
 
 
34
 
35
- `.txt` 文件放置於 `examples/` 資料夾,Gradio 介面會自動載入並顯示。
36
-
37
- 你也可以上傳 PDF 或直接輸入文字。
38
 
39
  ---
40
 
41
- ## 📦 檔案結構
42
 
43
  ```bash
44
- .
45
- ├── app.py # 主應用程式(Gradio UI)
46
- ├── aggregate.py # 多段摘要彙整模組
47
- ├── summarize.py # 單段文字摘要處理
48
- ├── pdf2text.py # PDF OCR / 文字擷取處理
49
- ├── utils.py # 工具函式
50
- ├── requirements.txt # 所需套件列表
51
- ├── examples/
52
- │ └── example1.txt # 範例檔案
53
- └── README.md # 說明文件
54
  ```
55
 
56
  ---
57
 
58
- ## Credits
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- - 🤖 模型:`pszemraj/bart-large-summary-map-reduce`
61
- - 📦 前端:Gradio Blocks
62
- - 👨‍💻 Author: Jimmy
 
1
+ # DocSummarizer
 
 
 
 
 
 
 
 
 
2
 
3
+ 本工具可將 PDF 文件自動擷取內容並摘要,支援兩種文字擷取模式:
4
 
5
+ ## ✅ 功能特色
6
 
7
+ - 📄 支援 PDF 檔文字擷取
8
+ - 🔍 可選「文字擷取」或「OCR 模式」
9
+ - 🤖 利用 BART 模型進行摘要
10
+ - 🌐 Gradio 介面操作簡便
 
11
 
12
  ---
13
 
14
+ ## 🧑‍💻 操作方式
 
 
15
 
16
+ 1. 啟動應用:
17
  ```bash
 
18
  python app.py
19
  ```
20
 
21
+ 2. 上傳 PDF 後選擇擷取模式:
22
+ - `simple`:適用於文字可複製的 PDF
23
+ - `ocr`:適用於圖片 PDF 或文字亂碼
24
 
25
+ 3. 查看並修改匯入文字後按下「Generate Summary」
 
 
26
 
27
  ---
28
 
29
+ ## 📦 依賴安裝
30
 
31
  ```bash
32
+ pip install -r requirements.txt
33
+ sudo apt install tesseract-ocr tesseract-ocr-chi-tra poppler-utils
 
 
 
 
 
 
 
 
34
  ```
35
 
36
  ---
37
 
38
+ ## 📁 檔案結構
39
+
40
+ ```
41
+ ├── app.py # 主介面
42
+ ├── pdf2text.py # PDF 文字擷取
43
+ ├── summarize.py # 摘要產生邏輯
44
+ ├── requirements.txt
45
+ ├── examples/
46
+ │ └── example1.txt
47
+ ```
48
+
49
+ ---
50
 
51
+ Jimmy 製作
 
 
pdf2text.py CHANGED
@@ -1,346 +1,33 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- pdf2text.py - convert pdf files to text files using OCR
4
- """
5
- import logging
6
  import os
7
- import re
8
- import shutil
9
- import time
10
- from datetime import date
11
- from os.path import join
12
- from pathlib import Path
13
-
14
- logging.basicConfig(
15
- level=logging.INFO,
16
- format="%(asctime)s %(levelname)s %(message)s",
17
- datefmt="%m/%d/%Y %I:%M:%S",
18
- )
19
-
20
-
21
- os.environ["USE_TORCH"] = "1"
22
-
23
- from cleantext import clean
24
- from doctr.io import DocumentFile
25
- from doctr.models import ocr_predictor
26
- from spellchecker import SpellChecker
27
-
28
-
29
- def simple_rename(filepath, target_ext=".txt"):
30
- """simple_rename - get a new str to rename a file"""
31
- _fp = Path(filepath)
32
- basename = _fp.stem
33
- return f"OCR_{basename}_{target_ext}"
34
-
35
-
36
- def rm_local_text_files(name_contains="RESULT_"):
37
- """
38
- rm_local_text_files - remove local text files
39
- """
40
- files = [
41
- f
42
- for f in Path.cwd().iterdir()
43
- if f.is_file() and f.suffix == ".txt" and name_contains in f.name
44
- ]
45
- logging.info(f"removing {len(files)} text files")
46
- for f in files:
47
- os.remove(f)
48
- logging.info("done")
49
-
50
-
51
- def corr(
52
- s: str,
53
- add_space_when_numerics=False,
54
- exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
55
- ) -> str:
56
- """corrects spacing in a string
57
-
58
- Args:
59
- s (str): the string to correct
60
- add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
61
- exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
62
-
63
- Returns:
64
- str: the corrected string
65
- """
66
- if add_space_when_numerics:
67
- s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)
68
-
69
- s = re.sub(r"\s+", " ", s)
70
- s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)
71
-
72
- # fix space before apostrophe
73
- s = re.sub(r"\s\'", r"'", s)
74
- # fix space after apostrophe
75
- s = re.sub(r"'\s", r"'", s)
76
- # fix space before comma
77
- s = re.sub(r"\s,", r",", s)
78
-
79
- for e in exceptions:
80
- expected_sub = re.sub(r"\s", "", e)
81
- s = s.replace(expected_sub, e)
82
-
83
- return s
84
-
85
-
86
- def fix_punct_spaces(string: str) -> str:
87
- """
88
- fix_punct_spaces - fix spaces around punctuation
89
-
90
- :param str string: input string
91
- :return str: string with spaces fixed
92
- """
93
-
94
- fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
95
- string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
96
- string = string.replace(" ' ", "'")
97
- string = string.replace(' " ', '"')
98
- return string.strip()
99
-
100
-
101
- def clean_OCR(ugly_text: str) -> str:
102
- """
103
- clean_OCR - clean up the OCR text
104
-
105
- :param str ugly_text: input text to be cleaned
106
- :return str: cleaned text
107
- """
108
- # Remove all the newlines.
109
- cleaned_text = ugly_text.replace("\n", " ")
110
- # Remove all the tabs.
111
- cleaned_text = cleaned_text.replace("\t", " ")
112
- # Remove all the double spaces.
113
- cleaned_text = cleaned_text.replace(" ", " ")
114
- # Remove all the spaces at the beginning of the text.
115
- cleaned_text = cleaned_text.lstrip()
116
- # remove all instances of "- " and " - "
117
- cleaned_text = cleaned_text.replace("- ", "")
118
- cleaned_text = cleaned_text.replace(" -", "")
119
- return fix_punct_spaces(cleaned_text)
120
-
121
-
122
- def move2completed(
123
- from_dir, filename, new_folder: str = "completed", verbose: bool = False
124
- ):
125
- """
126
- move2completed - move a file to a new folder
127
- """
128
- old_filepath = join(from_dir, filename)
129
-
130
- new_filedirectory = join(from_dir, new_folder)
131
-
132
- if not os.path.isdir(new_filedirectory):
133
- os.mkdir(new_filedirectory)
134
- if verbose:
135
- print("created new directory for files at: \n", new_filedirectory)
136
- new_filepath = join(new_filedirectory, filename)
137
 
 
 
138
  try:
139
- shutil.move(old_filepath, new_filepath)
140
- logging.info("successfully moved the file {} to */completed.".format(filename))
141
- except:
142
- logging.info(
143
- "ERROR! unable to move file to \n{}. Please investigate".format(
144
- new_filepath
145
- )
146
- )
147
-
148
-
149
- custom_replace_list = {
150
- "t0": "to",
151
- "'$": "'s",
152
- ",,": ", ",
153
- "_ ": " ",
154
- " '": "'",
155
- }
156
-
157
- replace_corr_exceptions = {
158
- "i. e.": "i.e.",
159
- "e. g.": "e.g.",
160
- "e. g": "e.g.",
161
- " ,": ",",
162
- }
163
-
164
-
165
- spell = SpellChecker()
166
-
167
-
168
- def check_word_spelling(word: str) -> bool:
169
- """
170
- check_word_spelling - check the spelling of a word
171
-
172
- Args:
173
- word (str): word to check
174
-
175
- Returns:
176
- bool: True if word is spelled correctly, False if not
177
- """
178
-
179
- misspelled = spell.unknown([word])
180
-
181
- return len(misspelled) == 0
182
-
183
-
184
- def eval_and_replace(text: str, match_token: str = "- ") -> str:
185
- """
186
- eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
187
-
188
- Args:
189
- text (str): text to evaluate
190
- match_token (str, optional): token to replace. Defaults to "- ".
191
-
192
- Returns:
193
- str: text with replaced tokens
194
- """
195
-
196
- if match_token not in text:
197
- return text
198
- else:
199
- while True:
200
- full_before_text = text.split(match_token, maxsplit=1)[0]
201
- before_text = [
202
- char for char in full_before_text.split()[-1] if char.isalpha()
203
- ]
204
- before_text = "".join(before_text)
205
- full_after_text = text.split(match_token, maxsplit=1)[-1]
206
- after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
207
- after_text = "".join(after_text)
208
- full_text = before_text + after_text
209
- if check_word_spelling(full_text):
210
- text = full_before_text + full_after_text
211
- else:
212
- text = full_before_text + " " + full_after_text
213
- if match_token not in text:
214
- break
215
- return text
216
-
217
-
218
- def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
219
- """
220
- cleantxt_ocr - clean text from OCR
221
-
222
- https://pypi.org/project/clean-text/
223
- Args:
224
- ugly_text (str): text to clean
225
- lower (bool, optional): lowercase text. Defaults to False.
226
- lang (str, optional): language of text. Defaults to "en".
227
-
228
- Returns:
229
- str: cleaned text
230
- """
231
-
232
- cleaned_text = clean(
233
- ugly_text,
234
- fix_unicode=True, # fix various unicode errors
235
- to_ascii=True, # transliterate to closest ASCII representation
236
- lower=lower, # lowercase text
237
- no_line_breaks=True, # fully strip line breaks as opposed to only normalizing them
238
- no_urls=True, # replace all URLs with a special token
239
- no_emails=True, # replace all email addresses with a special token
240
- no_phone_numbers=True, # replace all phone numbers with a special token
241
- no_numbers=False, # replace all numbers with a special token
242
- no_digits=False, # replace all digits with a special token
243
- no_currency_symbols=False, # replace all currency symbols with a special token
244
- no_punct=False, # remove punctuations
245
- replace_with_punct="", # instead of removing punctuations you may replace them
246
- replace_with_url="this url",
247
- replace_with_email="this email",
248
- replace_with_phone_number="this phone number",
249
- lang=lang, # set to 'de' for German special handling
250
- )
251
-
252
- return cleaned_text
253
-
254
-
255
- def format_ocr_out(OCR_data):
256
- """format OCR output to text"""
257
- if isinstance(OCR_data, list):
258
- text = " ".join(OCR_data)
259
- else:
260
- text = str(OCR_data)
261
- _clean = cleantxt_ocr(text)
262
- return corr(_clean)
263
-
264
-
265
- def postprocess(text: str) -> str:
266
- """to be used after recombining the lines"""
267
-
268
- proc = corr(cleantxt_ocr(text))
269
-
270
- for k, v in custom_replace_list.items():
271
- proc = proc.replace(str(k), str(v))
272
-
273
- proc = corr(proc)
274
-
275
- for k, v in replace_corr_exceptions.items():
276
- proc = proc.replace(str(k), str(v))
277
-
278
- return eval_and_replace(proc)
279
-
280
-
281
- def result2text(result, as_text=False) -> str or list:
282
- """Convert OCR result to text"""
283
-
284
- full_doc = []
285
- for i, page in enumerate(result.pages, start=1):
286
  text = ""
287
- for block in page.blocks:
288
- text += "\n\t"
289
- for line in block.lines:
290
- for word in line.words:
291
- # print(dir(word))
292
- text += word.value + " "
293
- full_doc.append(text)
294
-
295
- return "\n".join(full_doc) if as_text else full_doc
296
-
297
-
298
- def convert_PDF_to_Text(
299
- PDF_file,
300
- ocr_model=None,
301
- max_pages: int = 20,
302
- ) -> str:
303
- """
304
- convert_PDF_to_Text - convert a PDF file to text
305
-
306
- :param str PDF_file: path to PDF file
307
- :param ocr_model: model to use for OCR, defaults to None (uses the default model)
308
- :param int max_pages: maximum number of pages to process, defaults to 20
309
- :return str: text from PDF
310
- """
311
- st = time.perf_counter()
312
- PDF_file = Path(PDF_file)
313
- ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
314
- logging.info(f"starting OCR on {PDF_file.name}")
315
- doc = DocumentFile.from_pdf(PDF_file)
316
- truncated = False
317
- if len(doc) > max_pages:
318
- logging.warning(
319
- f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
320
- )
321
- doc = doc[:max_pages]
322
- truncated = True
323
-
324
- # Analyze
325
- logging.info(f"running OCR on {len(doc)} pages")
326
- result = ocr_model(doc)
327
- raw_text = result2text(result)
328
- proc_text = [format_ocr_out(r) for r in raw_text]
329
- fin_text = [postprocess(t) for t in proc_text]
330
-
331
- ocr_results = "\n\n".join(fin_text)
332
-
333
- fn_rt = time.perf_counter() - st
334
-
335
- logging.info("OCR complete")
336
-
337
- results_dict = {
338
- "num_pages": len(doc),
339
- "runtime": round(fn_rt, 2),
340
- "date": str(date.today()),
341
- "converted_text": ocr_results,
342
- "truncated": truncated,
343
- "length": len(ocr_results),
344
- }
345
-
346
- return results_dict
 
1
+ from pdf2image import convert_from_path
2
+ import pytesseract
3
+ from PyPDF2 import PdfReader
4
+ import tempfile
 
5
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ def extract_text_simple(pdf_path: str) -> str:
8
+ """使用 PyPDF2 直接提取 PDF 純文字"""
9
  try:
10
+ with open(pdf_path, "rb") as f:
11
+ reader = PdfReader(f)
12
+ return "\n\n".join(page.extract_text() or "" for page in reader.pages)
13
+ except Exception as e:
14
+ return f" PDF 讀取錯誤: {e}"
15
+
16
+ def extract_text_ocr(pdf_path: str) -> str:
17
+ """使用 OCR 擷取 PDF 的圖片並辨識成文字"""
18
+ try:
19
+ images = convert_from_path(pdf_path, dpi=300)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  text = ""
21
+ for i, img in enumerate(images):
22
+ gray = img.convert('L')
23
+ page_text = pytesseract.image_to_string(gray, lang='chi_tra')
24
+ text += f"\n\n--- Page {i+1} ---\n\n" + page_text
25
+ return text
26
+ except Exception as e:
27
+ return f"❌ OCR 擷取失敗: {e}"
28
+
29
+ def extract_text(pdf_path: str, mode: str = "simple") -> str:
30
+ """依模式選擇擷取方式:simple 或 ocr"""
31
+ if mode == "ocr":
32
+ return extract_text_ocr(pdf_path)
33
+ return extract_text_simple(pdf_path)