Jimmy0866 committed on
Commit
d1f503e
·
verified ·
1 Parent(s): 5c5e13c

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +69 -8
  2. app.py +57 -649
  3. pdf2text.py +45 -25
README.md CHANGED
@@ -1,10 +1,71 @@
 
 
 
 
 
1
  ---
2
- title: DocSummarizer_Jimmy
3
- emoji: 📄
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: "4.28.3"
8
- app_file: app.py
9
- pinned: false
 
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # DocSummarizer_Jimmy
3
+
4
+ 🚀 這是一個簡單易用的 PDF 與文字文件摘要工具,支援 OCR 模式與簡單文字轉換模式,提供使用者選擇性處理繁體中文文件,並以 Gradio 介面展示。
5
+
6
  ---
7
+
8
+ ## 🧰 功能特色
9
+
10
+ - ✅ 上傳 PDF,自動擷取文字或使用 OCR(適用掃描圖像型 PDF)
11
+ - ✅ 上傳 TXT,進行自動摘要
12
+ - ✅ 提供範例文件供測試(位於 `examples/` 資料夾)
13
+ - ✅ OCR 模式與簡單模式自由切換
14
+ - ✅ 中文介面與多語摘要模型支援
15
+
16
  ---
17
+
18
+ ## 📂 專案結構
19
+
20
+ ```
21
+ DocSummarizer_Jimmy/
22
+ ├── app.py # 主程式
23
+ ├── summarize.py # 摘要處理模組
24
+ ├── pdf2text.py # OCR 與 PDF 處理
25
+ ├── utils.py # 工具模組
26
+ ├── requirements.txt # 安裝依賴
27
+ ├── examples/
28
+ │ └── example1.txt # 範例測試文件
29
+ ```
30
+
31
+ ---
32
+
33
+ ## ⚙️ 使用方式
34
+
35
+ 1. 安裝依賴:
36
+
37
+ ```bash
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ 2. 執行 Gradio 應用:
42
+
43
+ ```bash
44
+ python app.py
45
+ ```
46
+
47
+ 3. 開啟瀏覽器後依需求:
48
+
49
+ - 選擇上傳 `PDF` 或 `TXT`
50
+ - 選擇 OCR 模式或簡單模式
51
+ - 點擊「Summarize」按鈕生成摘要
52
+ - 選用範例檔案進行測試(預設載入 example1.txt)
53
+
54
+ ---
55
+
56
+ ## 🧠 使用模型
57
+
58
+ - 🤖 `pszemraj/bart-large-summary-map-reduce`:適用於長文本摘要
59
+ - 📄 Tesseract(透過 `pytesseract`,語言包 `chi_tra+eng`):OCR 引擎,用於解析圖像 PDF
60
+
61
+ ---
62
+
63
+ ## 📝 備註
64
+
65
+ - 本工具針對繁體中文支援,OCR 輸出預設為 UTF-8。
66
+ - 使用掃描型 PDF 時請務必勾選 OCR 模式。
67
+ - 若遇模型下載失敗,請檢查網路或手動下載 HuggingFace 模型。
68
+
69
+ ---
70
+
71
+ Jimmy 工程師專案 — 持續優化中。歡迎反饋建議。
app.py CHANGED
@@ -1,667 +1,75 @@
1
- """
2
- app.py - the main module for the gradio app for summarization
3
 
4
- Usage:
5
- app.py [-h] [--share] [-m MODEL] [-nb ADD_BEAM_OPTION] [-batch TOKEN_BATCH_OPTION]
6
- [-level {DEBUG,INFO,WARNING,ERROR}]
7
- Details:
8
- python app.py --help
9
-
10
- Environment Variables:
11
- USE_TORCH (str): whether to use torch (1) or not (0)
12
- TOKENIZERS_PARALLELISM (str): whether to use parallelism (true) or not (false)
13
- Optional Environment Variables:
14
- APP_MAX_WORDS (int): the maximum number of words to use for summarization
15
- APP_OCR_MAX_PAGES (int): the maximum number of pages to use for OCR
16
- """
17
-
18
- import argparse
19
- import contextlib
20
- import gc
21
- import logging
22
  import os
23
- import pprint as pp
24
- import random
25
- import time
26
- from pathlib import Path
27
-
28
- os.environ["USE_TORCH"] = "1"
29
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
30
-
31
- logging.basicConfig(
32
- level=logging.INFO,
33
- format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
34
- datefmt="%Y-%b-%d %H:%M:%S",
35
- )
36
-
37
  import gradio as gr
38
- import nltk
39
- nltk.download('punkt') # 自動下載 tokenizer 資源
40
- import torch
41
- from cleantext import clean
42
- from doctr.models import ocr_predictor
43
-
44
- from aggregate import BatchAggregator
45
- from pdf2text import convert_pdf_to_text
46
- from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
47
- from utils import (
48
- contraction_aware_tokenize,
49
- extract_batches,
50
- load_example_filenames,
51
- remove_stagnant_files,
52
- remove_stopwords,
53
- saves_summary,
54
- textlist2html,
55
- truncate_word_count,
56
- )
57
-
58
- _here = Path(__file__).parent
59
-
60
- nltk.download("punkt", force=True, quiet=True)
61
- nltk.download("popular", force=True, quiet=True)
62
-
63
- # Constants & Globals
64
- MODEL_OPTIONS = [
65
- "BEE-spoke-data/pegasus-x-base-synthsumm_open-16k",
66
- "pszemraj/long-t5-tglobal-base-sci-simplify",
67
- "pszemraj/long-t5-tglobal-base-16384-book-summary",
68
- "pszemraj/long-t5-tglobal-base-summary-souffle-16384-loD",
69
- "pszemraj/pegasus-x-large-book_synthsumm",
70
- "pszemraj/pegasus-x-large-book-summary",
71
- ] # models users can choose from
72
- BEAM_OPTIONS = [2, 3, 4] # beam sizes users can choose from
73
- TOKEN_BATCH_OPTIONS = [
74
- 1024,
75
- 1536,
76
- 2048,
77
- 2560,
78
- 3072,
79
- ] # token batch sizes users can choose from
80
-
81
- SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
82
- AGGREGATE_MODEL = "pszemraj/bart-large-summary-map-reduce" # map-reduce model
83
-
84
- # if duplicating space: uncomment this line to adjust the max words
85
- # os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
86
- # os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
87
- # os.environ["APP_AGG_FORCE_CPU"] = str(1) # force cpu for aggregation
88
-
89
- aggregator = BatchAggregator(
90
- AGGREGATE_MODEL, force_cpu=os.environ.get("APP_AGG_FORCE_CPU", False)
91
- )
92
-
93
-
94
- def aggregate_text(
95
- summary_text: str,
96
- text_file: gr.File = None,
97
- ) -> str:
98
- """
99
- Aggregate the text from the batches.
100
-
101
- NOTE: you should probably include the BatchAggregator object as a fn arg if using this code
102
-
103
- :param batches_html: The batches to aggregate, in html format
104
- :param text_file: The text file to append the aggregate summary to
105
- :return: The aggregate summary in html format
106
- """
107
- if summary_text is None or summary_text == SUMMARY_PLACEHOLDER:
108
- logging.error("No text provided. Make sure a summary has been generated first.")
109
- return "Error: No text provided. Make sure a summary has been generated first."
110
-
111
- try:
112
- extracted_batches = extract_batches(summary_text)
113
- except Exception as e:
114
- logging.info(summary_text)
115
- logging.info(f"the batches html is: {type(summary_text)}")
116
- return f"Error: unable to extract batches - check input: {e}"
117
- if not extracted_batches:
118
- logging.error("unable to extract batches - check input")
119
- return "Error: unable to extract batches - check input"
120
-
121
- out_path = None
122
- if text_file is not None:
123
- out_path = text_file.name # assuming name attribute stores the file path
124
-
125
- content_batches = [batch["content"] for batch in extracted_batches]
126
- full_summary = aggregator.infer_aggregate(content_batches)
127
-
128
- # if a path that exists is provided, append the summary with markdown formatting
129
- if out_path:
130
- out_path = Path(out_path)
131
-
132
- try:
133
- with open(out_path, "a", encoding="utf-8") as f:
134
- f.write("\n\n## Aggregate Summary\n\n")
135
- f.write(
136
- "- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
137
- )
138
- f.write(f"- Aggregation model: {aggregator.model_name}\n\n")
139
- f.write(f"{full_summary}\n\n")
140
- logging.info(f"Updated {out_path} with aggregate summary")
141
- except Exception as e:
142
- logging.error(f"unable to update {out_path} with aggregate summary: {e}")
143
-
144
- full_summary_html = f"""
145
- <div style="
146
- margin-bottom: 20px;
147
- font-size: 18px;
148
- line-height: 1.5em;
149
- color: #333;
150
- ">
151
- <h2 style="font-size: 22px; color: #555;">Aggregate Summary:</h2>
152
- <p style="white-space: pre-line;">{full_summary}</p>
153
- </div>
154
- """
155
- return full_summary_html
156
-
157
-
158
- def predict(
159
- input_text: str,
160
- model_name: str,
161
- token_batch_length: int = 1024,
162
- empty_cache: bool = True,
163
- **settings,
164
- ) -> list:
165
- """
166
- predict - helper fn to support multiple models for summarization at once
167
-
168
- :param str input_text: the input text to summarize
169
- :param str model_name: model name to use
170
- :param int token_batch_length: the length of the token batches to use
171
- :param bool empty_cache: whether to empty the cache before loading a new= model
172
- :return: list of dicts with keys "summary" and "score"
173
- """
174
- if torch.cuda.is_available() and empty_cache:
175
- torch.cuda.empty_cache()
176
-
177
- model, tokenizer = load_model_and_tokenizer(model_name)
178
- summaries = summarize_via_tokenbatches(
179
- input_text,
180
- model,
181
- tokenizer,
182
- batch_length=token_batch_length,
183
- **settings,
184
- )
185
-
186
- del model
187
- del tokenizer
188
- gc.collect()
189
-
190
- return summaries
191
-
192
-
193
- def proc_submission(
194
- input_text: str,
195
- model_name: str,
196
- num_beams: int,
197
- token_batch_length: int,
198
- length_penalty: float,
199
- repetition_penalty: float,
200
- no_repeat_ngram_size: int,
201
- predrop_stopwords: bool,
202
- max_input_length: int = 6144,
203
- ):
204
- """
205
- proc_submission - a helper function for the gradio module to process submissions
206
-
207
- Args:
208
- input_text (str): the input text to summarize
209
- model_name (str): the hf model tag of the model to use
210
- num_beams (int): the number of beams to use
211
- token_batch_length (int): the length of the token batches to use
212
- length_penalty (float): the length penalty to use
213
- repetition_penalty (float): the repetition penalty to use
214
- no_repeat_ngram_size (int): the no repeat ngram size to use
215
- predrop_stopwords (bool): whether to pre-drop stopwords before truncating/summarizing
216
- max_input_length (int, optional): the maximum input length to use. Defaults to 6144.
217
-
218
- Note:
219
- the max_input_length is set to 6144 by default, but can be changed by setting the
220
- environment variable APP_MAX_WORDS to a different value.
221
-
222
- Returns:
223
- tuple (4): a tuple containing the following:
224
- """
225
-
226
- remove_stagnant_files() # clean up old files
227
- settings = {
228
- "length_penalty": float(length_penalty),
229
- "repetition_penalty": float(repetition_penalty),
230
- "no_repeat_ngram_size": int(no_repeat_ngram_size),
231
- "encoder_no_repeat_ngram_size": 4,
232
- "num_beams": int(num_beams),
233
- "min_length": 4,
234
- "max_length": int(token_batch_length // 4),
235
- "early_stopping": True,
236
- "do_sample": False,
237
- }
238
- max_input_length = int(os.environ.get("APP_MAX_WORDS", max_input_length))
239
- logging.info(
240
- f"max_input_length set to: {max_input_length}. pre-drop stopwords: {predrop_stopwords}"
241
- )
242
-
243
- st = time.perf_counter()
244
- history = {}
245
- cln_text = clean(input_text, lower=False)
246
- parsed_cln_text = remove_stopwords(cln_text) if predrop_stopwords else cln_text
247
- logging.info(
248
- f"pre-truncation word count: {len(contraction_aware_tokenize(parsed_cln_text))}"
249
- )
250
- truncation_validated = truncate_word_count(
251
- parsed_cln_text, max_words=max_input_length
252
- )
253
-
254
- if truncation_validated["was_truncated"]:
255
- model_input_text = truncation_validated["processed_text"]
256
- # create elaborate HTML warning
257
- input_wc = len(contraction_aware_tokenize(parsed_cln_text))
258
- msg = f"""
259
- <div style="background-color: #FFA500; color: white; padding: 20px;">
260
- <h3>Warning</h3>
261
- <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/input_wc:.2f}% of the original text.</p>
262
- <p>Dropping stopwords is set to {predrop_stopwords}. If this is not what you intended, please validate the advanced settings.</p>
263
- </div>
264
- """
265
- logging.warning(msg)
266
- history["WARNING"] = msg
267
- else:
268
- model_input_text = truncation_validated["processed_text"]
269
- msg = None
270
-
271
- if len(input_text) < 50:
272
- # this is essentially a different case from the above
273
- msg = f"""
274
- <div style="background-color: #880808; color: white; padding: 20px;">
275
- <br>
276
- <img src="https://i.imgflip.com/7kadd9.jpg" alt="no text">
277
- <br>
278
- <h3>Error</h3>
279
- <p>Input text is too short to summarize. Detected {len(input_text)} characters.
280
- Please load text by selecting an example from the dropdown menu or by pasting text into the text box.</p>
281
- </div>
282
- """
283
- logging.warning(msg)
284
- logging.warning("RETURNING EMPTY STRING")
285
- history["WARNING"] = msg
286
-
287
- return msg, "<strong>No summary generated.</strong>", "", []
288
-
289
- _summaries = predict(
290
- input_text=model_input_text,
291
- model_name=model_name,
292
- token_batch_length=token_batch_length,
293
- **settings,
294
- )
295
- sum_text = [s["summary"][0].strip() + "\n" for s in _summaries]
296
- sum_scores = [
297
- f" - Batch Summary {i}: {round(s['summary_score'],4)}"
298
- for i, s in enumerate(_summaries)
299
- ]
300
-
301
- full_summary = textlist2html(sum_text)
302
- history["Summary Scores"] = "<br><br>"
303
- scores_out = "\n".join(sum_scores)
304
- rt = round((time.perf_counter() - st) / 60, 2)
305
- logging.info(f"Runtime: {rt} minutes")
306
- html = ""
307
- html += f"<p>Runtime: {rt} minutes with model: {model_name}</p>"
308
- if msg is not None:
309
- html += msg
310
-
311
- html += ""
312
-
313
- settings["remove_stopwords"] = predrop_stopwords
314
- settings["model_name"] = model_name
315
- saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
316
- return html, full_summary, scores_out, saved_file
317
-
318
 
319
- def load_single_example_text(
320
- example_path: str or Path,
321
- max_pages: int = 20,
322
- ) -> str:
323
- """
324
- load_single_example_text - loads a single example text file
325
 
326
- :param strorPath example_path: name of the example to load
327
- :param int max_pages: the maximum number of pages to load from a PDF
328
- :return str: the text of the example
329
- """
330
- global name_to_path, ocr_model
331
- full_ex_path = name_to_path[example_path]
332
- full_ex_path = Path(full_ex_path)
333
- if full_ex_path.suffix in [".txt", ".md"]:
334
- with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
335
- raw_text = f.read()
336
- text = clean(raw_text, lower=False)
337
- elif full_ex_path.suffix == ".pdf":
338
- logging.info(f"Loading PDF file {full_ex_path}")
339
- max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
340
- logging.info(f"max_pages set to: {max_pages}")
341
- conversion_stats = convert_PDF_to_Text(
342
- full_ex_path,
343
- ocr_model=ocr_model,
344
- max_pages=max_pages,
345
- )
346
- text = conversion_stats["converted_text"]
347
- else:
348
- logging.error(f"Unknown file type {full_ex_path.suffix}")
349
- text = "ERROR - check example path"
350
 
351
- return text
 
 
 
 
 
 
 
 
352
 
 
 
 
 
 
 
353
 
354
- def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
355
- """
356
- load_uploaded_file - loads a file uploaded by the user
357
 
358
- :param file_obj (POTENTIALLY list): Gradio file object inside a list
359
- :param int max_pages: the maximum number of pages to load from a PDF
360
- :param bool lower: whether to lowercase the text
361
- :return str: the text of the file
362
- """
363
- global ocr_model
364
- logger = logging.getLogger(__name__)
365
- # check if mysterious file object is a list
366
- if isinstance(file_obj, list):
367
- file_obj = file_obj[0]
368
- file_path = Path(file_obj.name)
369
  try:
370
- logger.info(f"Loading file:\t{file_path}")
371
- if file_path.suffix in [".txt", ".md"]:
372
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
373
- raw_text = f.read()
374
- text = clean(raw_text, lower=lower)
375
- elif file_path.suffix == ".pdf":
376
- logger.info(f"loading a PDF file: {file_path.name}")
377
- max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
378
- logger.info(f"max_pages is: {max_pages}. Starting conversion...")
379
- conversion_stats = convert_PDF_to_Text(
380
- file_path,
381
- ocr_model=ocr_model,
382
- max_pages=max_pages,
383
- )
384
- text = conversion_stats["converted_text"]
385
  else:
386
- logger.error(f"Unknown file type:\t{file_path.suffix}")
387
- text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
388
-
389
- return text
390
  except Exception as e:
391
- logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
392
- return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
393
-
394
-
395
- def parse_args():
396
- """arguments for the command line interface"""
397
- parser = argparse.ArgumentParser(
398
- description="Document Summarization with Long-Document Transformers - Demo",
399
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
400
- epilog="Runs a local-only web UI to summarize documents. pass --share for a public link to share.",
401
- )
402
-
403
- parser.add_argument(
404
- "--share",
405
- dest="share",
406
- action="store_true",
407
- help="Create a public link to share",
408
- )
409
- parser.add_argument(
410
- "-m",
411
- "--model",
412
- type=str,
413
- default=None,
414
- help=f"Add a custom model to the list of models: {pp.pformat(MODEL_OPTIONS, compact=True)}",
415
- )
416
- parser.add_argument(
417
- "-nb",
418
- "--add_beam_option",
419
- type=int,
420
- default=None,
421
- help=f"Add a beam search option to the demo UI options, default: {pp.pformat(BEAM_OPTIONS, compact=True)}",
422
- )
423
- parser.add_argument(
424
- "-batch",
425
- "--token_batch_option",
426
- type=int,
427
- default=None,
428
- help=f"Add a token batch size to the demo UI options, default: {pp.pformat(TOKEN_BATCH_OPTIONS, compact=True)}",
429
- )
430
- parser.add_argument(
431
- "-max_agg",
432
- "-2x",
433
- "--aggregator_beam_boost",
434
- dest="aggregator_beam_boost",
435
- action="store_true",
436
- help="Double the number of beams for the aggregator during beam search",
437
- )
438
- parser.add_argument(
439
- "-level",
440
- "--log_level",
441
- type=str,
442
- default="INFO",
443
- choices=["DEBUG", "INFO", "WARNING", "ERROR"],
444
- help="Set the logging level",
445
- )
446
-
447
- return parser.parse_args()
448
 
 
 
449
 
450
- if __name__ == "__main__":
451
- """main - the main function of the app"""
452
- logger = logging.getLogger(__name__)
453
- args = parse_args()
454
- logger.setLevel(args.log_level)
455
- logger.info(f"args: {pp.pformat(args.__dict__, compact=True)}")
456
 
457
- # add any custom options
458
- if args.model is not None:
459
- logger.info(f"Adding model {args.model} to the list of models")
460
- MODEL_OPTIONS.append(args.model)
461
- if args.add_beam_option is not None:
462
- logger.info(f"Adding beam search option {args.add_beam_option} to the list")
463
- BEAM_OPTIONS.append(args.add_beam_option)
464
- if args.token_batch_option is not None:
465
- logger.info(f"Adding token batch option {args.token_batch_option} to the list")
466
- TOKEN_BATCH_OPTIONS.append(args.token_batch_option)
467
-
468
- if args.aggregator_beam_boost:
469
- logger.info("Doubling aggregator num_beams")
470
- _agg_cfg = aggregator.get_generation_config()
471
- _agg_cfg["num_beams"] = _agg_cfg["num_beams"] * 2
472
- aggregator.update_generation_config(**_agg_cfg)
473
-
474
- logger.info("Loading OCR model")
475
- with contextlib.redirect_stdout(None):
476
- ocr_model = ocr_predictor(
477
- "db_resnet50",
478
- "crnn_mobilenet_v3_large",
479
- pretrained=True,
480
- assume_straight_pages=True,
481
- )
482
-
483
- # load the examples
484
- name_to_path = load_example_filenames(_here / "examples")
485
- logger.info(f"Loaded {len(name_to_path)} examples")
486
-
487
- demo = gr.Blocks(title="Document Summarization")
488
- _examples = list(name_to_path.keys())
489
- logger.info("Starting app instance")
490
- with demo:
491
- gr.Markdown(
492
- """# Document Summarization with Long-Document Transformers
493
-
494
- An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://hf.co/datasets/kmfoda/booksum). Architectures [in this demo](https://hf.co/spaces/pszemraj/document-summarization) are [LongT5-base](https://hf.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://hf.co/pszemraj/pegasus-x-large-book-summary).
495
-
496
- **Want more performance?** Run this demo from a free [Google Colab GPU](https://colab.research.google.com/gist/pszemraj/52f67cf7326e780155812a6a1f9bb724/document-summarization-on-gpu.ipynb)
497
- """
498
- )
499
  with gr.Column():
500
- gr.Markdown(
501
- """## Load Inputs & Select Parameters
502
-
503
- Enter/paste text below, or upload a file. Pick a model & adjust params (_optional_), and press **Summarize!**
504
-
505
- See [the guide doc](https://gist.github.com/pszemraj/722a7ba443aa3a671b02d87038375519) for details.
506
- """
 
 
 
 
 
 
 
 
 
507
  )
508
- with gr.Row():
509
- with gr.Column(variant="compact"):
510
- model_name = gr.Dropdown(
511
- choices=MODEL_OPTIONS,
512
- value=MODEL_OPTIONS[0],
513
- label="Model Name",
514
- )
515
- num_beams = gr.Radio(
516
- choices=BEAM_OPTIONS,
517
- value=BEAM_OPTIONS[len(BEAM_OPTIONS) // 2],
518
- label="Beam Search: # of Beams",
519
- )
520
- load_examples_button = gr.Button(
521
- "Load Example in Dropdown",
522
- )
523
- load_file_button = gr.Button("Upload & Process File")
524
- with gr.Column(variant="compact"):
525
- example_name = gr.Dropdown(
526
- _examples,
527
- label="Examples",
528
- value=random.choice(_examples),
529
- )
530
- uploaded_file = gr.File(
531
- label="File Upload",
532
- file_count="single",
533
- file_types=[".txt", ".md", ".pdf"],
534
- type="filepath",
535
- )
536
- with gr.Row():
537
- input_text = gr.Textbox(
538
- lines=4,
539
- max_lines=8,
540
- label="Text to Summarize",
541
- placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
542
- )
543
- with gr.Column():
544
- gr.Markdown("## Generate Summary")
545
- with gr.Row():
546
- summarize_button = gr.Button(
547
- "Summarize!",
548
- variant="primary",
549
- )
550
- gr.Markdown(
551
- "_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
552
- )
553
- output_text = gr.HTML("<em>Output will appear below:</em>")
554
- with gr.Column():
555
- gr.Markdown("### Results & Scores")
556
- with gr.Row():
557
- with gr.Column(variant="compact"):
558
- gr.Markdown(
559
- "Download the summary as a text file, with parameters and scores."
560
- )
561
- text_file = gr.File(
562
- label="Download as Text File",
563
- file_count="single",
564
- type="filepath",
565
- interactive=False,
566
- )
567
- with gr.Column(variant="compact"):
568
- gr.Markdown(
569
- "Scores **roughly** represent the summary quality as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
570
- )
571
- summary_scores = gr.Textbox(
572
- label="Summary Scores",
573
- placeholder="Summary scores will appear here",
574
- )
575
- with gr.Column(variant="panel"):
576
- gr.Markdown("### **Summary Output**")
577
- summary_text = gr.HTML(
578
- label="Summary",
579
- value="<i>Summary will appear here!</i>",
580
- )
581
- with gr.Column():
582
- gr.Markdown("### **Aggregate Summary Batches**")
583
- with gr.Row():
584
- aggregate_button = gr.Button(
585
- "Aggregate!",
586
- variant="primary",
587
- )
588
- gr.Markdown(
589
- f"""Aggregate the above batches into a cohesive summary.
590
- - A secondary instruct-tuned LM consolidates info
591
- - Current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
592
- """
593
- )
594
- with gr.Column(variant="panel"):
595
- aggregated_summary = gr.HTML(
596
- label="Aggregate Summary",
597
- value="<i>Aggregate summary will appear here!</i>",
598
- )
599
-
600
- with gr.Column():
601
- gr.Markdown(
602
- """### Advanced Settings
603
-
604
- Refer to [the guide doc](https://gist.github.com/pszemraj/722a7ba443aa3a671b02d87038375519) for what these are, and how they impact _quality_ and _speed_.
605
- """
606
- )
607
- with gr.Row():
608
- length_penalty = gr.Slider(
609
- minimum=0.3,
610
- maximum=1.1,
611
- label="length penalty",
612
- value=0.7,
613
- step=0.05,
614
- )
615
- token_batch_length = gr.Radio(
616
- choices=TOKEN_BATCH_OPTIONS,
617
- label="token batch length",
618
- # select median option
619
- value=TOKEN_BATCH_OPTIONS[len(TOKEN_BATCH_OPTIONS) // 2],
620
- )
621
-
622
- with gr.Row():
623
- repetition_penalty = gr.Slider(
624
- minimum=1.0,
625
- maximum=5.0,
626
- label="repetition penalty",
627
- value=1.5,
628
- step=0.1,
629
- )
630
- no_repeat_ngram_size = gr.Radio(
631
- choices=[2, 3, 4, 5],
632
- label="no repeat ngram size",
633
- value=3,
634
- )
635
- predrop_stopwords = gr.Checkbox(
636
- label="Drop Stopwords (Pre-Truncation)",
637
- value=False,
638
- )
639
-
640
- load_examples_button.click(
641
- fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
642
- )
643
-
644
- load_file_button.click(
645
- fn=load_uploaded_file, inputs=uploaded_file, outputs=[input_text]
646
- )
647
 
648
- summarize_button.click(
649
- fn=proc_submission,
650
- inputs=[
651
- input_text,
652
- model_name,
653
- num_beams,
654
- token_batch_length,
655
- length_penalty,
656
- repetition_penalty,
657
- no_repeat_ngram_size,
658
- predrop_stopwords,
659
- ],
660
- outputs=[output_text, summary_text, summary_scores, text_file],
661
- )
662
- aggregate_button.click(
663
- fn=aggregate_text,
664
- inputs=[summary_text, text_file],
665
- outputs=[aggregated_summary],
666
- )
667
- demo.launch(share=args.share, debug=True)
 
 
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import gradio as gr
4
+ from summarize import summarize_text
5
+ from pdf2text import convert_PDF_to_Text
6
+ import logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory (relative to the current working directory) scanned for examples.
EXAMPLES_DIR = "examples"


def load_examples():
    """Map example names to the paths of the ``.txt`` files in EXAMPLES_DIR.

    The example name is the file name with only its final ``.txt`` extension
    removed, so ``notes.txt.v2.txt`` is registered as ``notes.txt.v2``.

    Returns:
        dict: ``{example_name: path}`` in sorted file-name order. Empty when
        EXAMPLES_DIR is missing or is not a directory.
    """
    name_to_path = {}
    # isdir (not exists): a plain file named like EXAMPLES_DIR would make
    # os.listdir raise NotADirectoryError.
    if os.path.isdir(EXAMPLES_DIR):
        # os.listdir returns entries in arbitrary order; sort so the first
        # (default) example is the same on every run.
        for file_name in sorted(os.listdir(EXAMPLES_DIR)):
            stem, extension = os.path.splitext(file_name)
            if extension == ".txt":
                name_to_path[stem] = os.path.join(EXAMPLES_DIR, file_name)
    logger.info("Loaded %d examples", len(name_to_path))
    return name_to_path
22
 
23
def get_example_text(example_name, name_to_path):
    """Return the UTF-8 text of the example registered under *example_name*.

    Args:
        example_name: Key to look up in *name_to_path*.
        name_to_path: Mapping of example name to file path.

    Returns:
        str: The file contents, or ``""`` when the name is unknown or the
        file cannot be opened (missing, a directory, permission denied).
    """
    path = name_to_path.get(example_name)
    if not path:
        return ""
    # Try the read and handle failure instead of checking existence first:
    # the file can vanish between the check and the open, and an existing
    # path may still be unreadable.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except OSError:
        return ""
29
 
30
+ name_to_path = load_examples()
 
 
31
 
32
def summarize_interface(input_text, summary_length, summary_type, use_ocr):
    """Summarize pasted text, or the file whose path was typed into the box.

    Args:
        input_text: Raw text, or a path to an existing PDF / text file.
        summary_length: Forwarded unchanged to ``summarize_text``.
        summary_type: Forwarded unchanged to ``summarize_text``.
        use_ocr: When true, PDFs are converted with OCR instead of the
            embedded text layer.

    Returns:
        str: The value returned by ``summarize_text``, or a
        `` Summarization failed: ...`` message if anything raised.
    """
    try:
        # NOTE(review): the textbox value is user-controlled and is opened as
        # a server-side path when it names an existing file, so any file
        # readable by the process can be summarized. Consider restricting
        # this to an allow-listed directory.
        if input_text.lower().endswith(".pdf") and os.path.isfile(input_text):
            # convert_PDF_to_Text takes `ocr_model` (truthy selects OCR) and
            # returns a dict; the extracted text is under "converted_text".
            conversion = convert_PDF_to_Text(
                input_text, ocr_model=True if use_ocr else None
            )
            result_text = conversion["converted_text"]
        elif os.path.isfile(input_text):
            with open(input_text, "r", encoding="utf-8") as f:
                result_text = f.read()
        else:
            result_text = input_text
        return summarize_text(result_text, summary_length, summary_type)
    except Exception as e:
        # UI boundary: log the traceback and show the user a message rather
        # than letting the Gradio callback crash.
        logger.exception("Summarization failed:")
        return f" Summarization failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
example_names = list(name_to_path.keys())
default_example = example_names[0] if example_names else None

# Build the Gradio UI: input textbox + options on one row, output below, and
# an optional examples panel when example files were found.
with gr.Blocks() as demo:
    # The heading and tagline are separated by an escaped newline; a regular
    # string literal cannot span two physical source lines.
    gr.Markdown("# DocSummarizer\n使用 AI 自動摘要你的文件 📄")

    with gr.Row():
        input_textbox = gr.Textbox(
            label="Text to Summarize (or PDF path)",
            lines=15,
            placeholder="輸入或貼上文字,或提供 txt/pdf 檔案路徑",
        )
        with gr.Column():
            summary_length = gr.Slider(50, 1000, value=250, label="Summary Length")
            summary_type = gr.Radio(
                choices=["map", "map-reduce"],
                value="map-reduce",
                label="Summarization Strategy",
            )
            use_ocr = gr.Checkbox(label="Use OCR for PDF", value=False)
            submit_button = gr.Button("Summarize")

    output_textbox = gr.Textbox(label="Summarized Output", lines=15)
    submit_button.click(
        fn=summarize_interface,
        inputs=[input_textbox, summary_length, summary_type, use_ocr],
        outputs=output_textbox,
    )

    if default_example:
        with gr.Row():
            # Clicking an example only copies its value into the textbox, so
            # the value must be something summarize_interface can consume.
            # The example file path works because summarize_interface reads
            # existing files; a bare example name would be summarized as
            # literal text.
            gr.Examples(
                examples=[[name_to_path[name]] for name in example_names],
                inputs=input_textbox,
                label="📚 範例檔案",
                cache_examples=False,
            )

demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pdf2text.py CHANGED
@@ -1,33 +1,53 @@
1
- from pdf2image import convert_from_path
2
  import pytesseract
3
  from PyPDF2 import PdfReader
4
- import tempfile
5
- import os
6
 
7
  def extract_text_simple(pdf_path: str) -> str:
8
- """使用 PyPDF2 直接提取 PDF 純文字"""
9
- try:
10
- with open(pdf_path, "rb") as f:
11
- reader = PdfReader(f)
12
- return "\n\n".join(page.extract_text() or "" for page in reader.pages)
13
- except Exception as e:
14
- return f"❌ PDF 讀取錯誤: {e}"
 
 
 
15
 
16
- def extract_text_ocr(pdf_path: str) -> str:
17
- """使用 OCR 擷取 PDF 的圖片並辨識成文字"""
18
- try:
19
- images = convert_from_path(pdf_path, dpi=300)
20
- text = ""
21
- for i, img in enumerate(images):
22
- gray = img.convert('L')
23
- page_text = pytesseract.image_to_string(gray, lang='chi_tra')
24
- text += f"\n\n--- Page {i+1} ---\n\n" + page_text
25
- return text
26
- except Exception as e:
27
- return f"❌ OCR 擷取失敗: {e}"
28
 
29
- def extract_text(pdf_path: str, mode: str = "simple") -> str:
30
- """依模式選擇擷取方式:simple 或 ocr"""
 
 
31
  if mode == "ocr":
32
  return extract_text_ocr(pdf_path)
33
- return extract_text_simple(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import pytesseract
3
  from PyPDF2 import PdfReader
4
+ from pdf2image import convert_from_path
5
+ from typing import Literal
6
 
7
def extract_text_simple(pdf_path: str) -> str:
    """Extract the embedded text of a PDF with PyPDF2.

    Pages for which ``extract_text()`` returns nothing are skipped; the
    remaining page texts are stripped and joined with newlines.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "\n".join(raw.strip() for raw in page_texts if raw)
18
 
19
def extract_text_ocr(pdf_path: str, dpi: int = 300) -> str:
    """Extract text from an image-based PDF using Tesseract OCR.

    The PDF is converted to page images with ``convert_from_path`` at *dpi*,
    each image is recognized with the ``chi_tra+eng`` language setting, and
    the non-empty results are stripped and joined with newlines.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    recognized = (
        pytesseract.image_to_string(image, lang="chi_tra+eng")
        for image in page_images
    )
    return "\n".join(raw.strip() for raw in recognized if raw)
 
30
 
31
def extract_text(pdf_path: str, mode: Literal["simple", "ocr"] = "simple") -> str:
    """Extract text from *pdf_path* using the method selected by *mode*.

    ``"ocr"`` dispatches to ``extract_text_ocr``; any other value dispatches
    to ``extract_text_simple``.
    """
    extractor = extract_text_ocr if mode == "ocr" else extract_text_simple
    return extractor(pdf_path)
39
+
40
# Compatibility interface for app.py.
def convert_PDF_to_Text(
    pdf_path: str,
    ocr_model=None,
    max_pages: int = 20,
    use_ocr: bool = False,
) -> dict:
    """Convert a PDF to text behind the ``convert_PDF_to_Text`` interface.

    Args:
        pdf_path: Path of the PDF to convert.
        ocr_model: Any truthy value selects OCR mode. The object itself is
            not used for recognition.
        max_pages: Accepted for interface compatibility only.
            NOTE(review): it is not applied; every page is processed.
        use_ocr: Boolean alternative to *ocr_model* for callers that pass an
            OCR on/off flag by keyword. OCR is used when either is truthy.

    Returns:
        dict: ``converted_text`` (the extracted text), ``source_path``,
        ``used_ocr`` (bool) and ``page_count`` (always ``"N/A"``).
    """
    ocr_requested = bool(ocr_model) or bool(use_ocr)
    text = extract_text(pdf_path, mode="ocr" if ocr_requested else "simple")
    return {
        "converted_text": text,
        "source_path": pdf_path,
        "used_ocr": ocr_requested,
        "page_count": "N/A",
    }


# Lower-case alias of the same function.
convert_pdf_to_text = convert_PDF_to_Text