Spaces:
Runtime error
Runtime error
重命名一些函数
Browse files
- functional.py → core_functional.py +1 -1
- functional_crazy.py → crazy_functional.py +9 -1
- crazy_functions/crazy_utils.py +58 -0
- crazy_functions/代码重写为全英文_多线程.py +1 -27
- crazy_functions/批量翻译PDF文档_多线程.py +255 -0
- crazy_functions/高级功能函数模板.py +7 -6
- main.py +4 -4
- request_llm/bridge_chatgpt.py +28 -12
- request_llm/bridge_tgui.py +5 -5
- requirements.txt +1 -0
functional.py → core_functional.py
RENAMED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
# 默认按钮颜色是 secondary
|
| 5 |
from toolbox import clear_line_break
|
| 6 |
|
| 7 |
-
def
|
| 8 |
return {
|
| 9 |
"英语学术润色": {
|
| 10 |
# 前言
|
|
|
|
| 4 |
# 默认按钮颜色是 secondary
|
| 5 |
from toolbox import clear_line_break
|
| 6 |
|
| 7 |
+
def get_core_functions():
|
| 8 |
return {
|
| 9 |
"英语学术润色": {
|
| 10 |
# 前言
|
functional_crazy.py → crazy_functional.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from toolbox import HotReload # HotReload 的意思是热更新,修改函数插件后,不需要重启程序,代码直接生效
|
| 2 |
|
| 3 |
-
def get_crazy_functionals():
|
| 4 |
###################### 第一组插件 ###########################
|
| 5 |
# [第一组插件]: 最早期编写的项目插件和一些demo
|
| 6 |
from crazy_functions.读文章写摘要 import 读文章写摘要
|
|
@@ -97,6 +97,14 @@ def get_crazy_functionals():
|
|
| 97 |
"Function": HotReload(下载arxiv论文并翻译摘要)
|
| 98 |
}
|
| 99 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
except Exception as err:
|
| 101 |
print(f'[下载arxiv论文并翻译摘要] 插件导入失败 {str(err)}')
|
| 102 |
|
|
|
|
| 1 |
from toolbox import HotReload # HotReload 的意思是热更新,修改函数插件后,不需要重启程序,代码直接生效
|
| 2 |
|
| 3 |
+
def get_crazy_functions():
|
| 4 |
###################### 第一组插件 ###########################
|
| 5 |
# [第一组插件]: 最早期编写的项目插件和一些demo
|
| 6 |
from crazy_functions.读文章写摘要 import 读文章写摘要
|
|
|
|
| 97 |
"Function": HotReload(下载arxiv论文并翻译摘要)
|
| 98 |
}
|
| 99 |
})
|
| 100 |
+
from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
|
| 101 |
+
function_plugins.update({
|
| 102 |
+
"批量翻译PDF文档(多线程)": {
|
| 103 |
+
"Color": "stop",
|
| 104 |
+
"AsButton": False, # 加入下拉菜单中
|
| 105 |
+
"Function": HotReload(批量翻译PDF文档)
|
| 106 |
+
}
|
| 107 |
+
})
|
| 108 |
except Exception as err:
|
| 109 |
print(f'[下载arxiv论文并翻译摘要] 插件导入失败 {str(err)}')
|
| 110 |
|
crazy_functions/crazy_utils.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
|
| 5 |
+
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
| 6 |
+
if get_token_fn(txt_tocut) <= limit:
|
| 7 |
+
return [txt_tocut]
|
| 8 |
+
else:
|
| 9 |
+
lines = txt_tocut.split('\n')
|
| 10 |
+
estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
|
| 11 |
+
estimated_line_cut = int(estimated_line_cut)
|
| 12 |
+
for cnt in reversed(range(estimated_line_cut)):
|
| 13 |
+
if must_break_at_empty_line:
|
| 14 |
+
if lines[cnt] != "": continue
|
| 15 |
+
print(cnt)
|
| 16 |
+
prev = "\n".join(lines[:cnt])
|
| 17 |
+
post = "\n".join(lines[cnt:])
|
| 18 |
+
if get_token_fn(prev) < limit: break
|
| 19 |
+
if cnt == 0:
|
| 20 |
+
print('what the fuck ?')
|
| 21 |
+
raise RuntimeError("存在一行极长的文本!")
|
| 22 |
+
# print(len(post))
|
| 23 |
+
# 列表递归接龙
|
| 24 |
+
result = [prev]
|
| 25 |
+
result.extend(cut(post, must_break_at_empty_line))
|
| 26 |
+
return result
|
| 27 |
+
try:
|
| 28 |
+
return cut(txt, must_break_at_empty_line=True)
|
| 29 |
+
except RuntimeError:
|
| 30 |
+
return cut(txt, must_break_at_empty_line=False)
|
| 31 |
+
|
| 32 |
+
def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
|
| 33 |
+
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
| 34 |
+
if get_token_fn(txt_tocut) <= limit:
|
| 35 |
+
return [txt_tocut]
|
| 36 |
+
else:
|
| 37 |
+
lines = txt_tocut.split('\n')
|
| 38 |
+
estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
|
| 39 |
+
estimated_line_cut = int(estimated_line_cut)
|
| 40 |
+
for cnt in reversed(range(estimated_line_cut)):
|
| 41 |
+
if must_break_at_empty_line:
|
| 42 |
+
if lines[cnt] != "": continue
|
| 43 |
+
print(cnt)
|
| 44 |
+
prev = "\n".join(lines[:cnt])
|
| 45 |
+
post = "\n".join(lines[cnt:])
|
| 46 |
+
if get_token_fn(prev) < limit: break
|
| 47 |
+
if cnt == 0:
|
| 48 |
+
print('what the fuck ?')
|
| 49 |
+
raise RuntimeError("存在一行极长的文本!")
|
| 50 |
+
# print(len(post))
|
| 51 |
+
# 列表递归接龙
|
| 52 |
+
result = [prev]
|
| 53 |
+
result.extend(cut(post, must_break_at_empty_line))
|
| 54 |
+
return result
|
| 55 |
+
try:
|
| 56 |
+
return cut(txt, must_break_at_empty_line=True)
|
| 57 |
+
except RuntimeError:
|
| 58 |
+
return cut(txt, must_break_at_empty_line=False)
|
crazy_functions/代码重写为全英文_多线程.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import threading
|
| 2 |
from request_llm.bridge_chatgpt import predict_no_ui_long_connection
|
| 3 |
from toolbox import CatchException, write_results_to_file, report_execption
|
|
|
|
| 4 |
|
| 5 |
def extract_code_block_carefully(txt):
|
| 6 |
splitted = txt.split('```')
|
|
@@ -10,33 +11,6 @@ def extract_code_block_carefully(txt):
|
|
| 10 |
txt_out = '```'.join(splitted[1:-1])
|
| 11 |
return txt_out
|
| 12 |
|
| 13 |
-
def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit, must_break_at_empty_line=True):
|
| 14 |
-
def cut(txt_tocut, must_break_at_empty_line): # 递归
|
| 15 |
-
if get_token_fn(txt_tocut) <= limit:
|
| 16 |
-
return [txt_tocut]
|
| 17 |
-
else:
|
| 18 |
-
lines = txt_tocut.split('\n')
|
| 19 |
-
estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
|
| 20 |
-
estimated_line_cut = int(estimated_line_cut)
|
| 21 |
-
for cnt in reversed(range(estimated_line_cut)):
|
| 22 |
-
if must_break_at_empty_line:
|
| 23 |
-
if lines[cnt] != "": continue
|
| 24 |
-
print(cnt)
|
| 25 |
-
prev = "\n".join(lines[:cnt])
|
| 26 |
-
post = "\n".join(lines[cnt:])
|
| 27 |
-
if get_token_fn(prev) < limit: break
|
| 28 |
-
if cnt == 0:
|
| 29 |
-
print('what the f?')
|
| 30 |
-
raise RuntimeError("存在一行极长的文本!")
|
| 31 |
-
print(len(post))
|
| 32 |
-
# 列表递归接龙
|
| 33 |
-
result = [prev]
|
| 34 |
-
result.extend(cut(post, must_break_at_empty_line))
|
| 35 |
-
return result
|
| 36 |
-
try:
|
| 37 |
-
return cut(txt, must_break_at_empty_line=True)
|
| 38 |
-
except RuntimeError:
|
| 39 |
-
return cut(txt, must_break_at_empty_line=False)
|
| 40 |
|
| 41 |
|
| 42 |
def break_txt_into_half_at_some_linebreak(txt):
|
|
|
|
| 1 |
import threading
|
| 2 |
from request_llm.bridge_chatgpt import predict_no_ui_long_connection
|
| 3 |
from toolbox import CatchException, write_results_to_file, report_execption
|
| 4 |
+
from .crazy_utils import breakdown_txt_to_satisfy_token_limit
|
| 5 |
|
| 6 |
def extract_code_block_carefully(txt):
|
| 7 |
splitted = txt.split('```')
|
|
|
|
| 11 |
txt_out = '```'.join(splitted[1:-1])
|
| 12 |
return txt_out
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def break_txt_into_half_at_some_linebreak(txt):
|
crazy_functions/批量翻译PDF文档_多线程.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
|
| 2 |
+
import re
|
| 3 |
+
import unicodedata
|
| 4 |
+
fast_debug = False
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def is_paragraph_break(match):
|
| 8 |
+
"""
|
| 9 |
+
根据给定的匹配结果来判断换行符是否表示段落分隔。
|
| 10 |
+
如果换行符前为句子结束标志(句号,感叹号,问号),且下一个字符为大写字母,则换行符更有可能表示段落分隔。
|
| 11 |
+
也可以根据之前的内容长度来判断段落是否已经足够长。
|
| 12 |
+
"""
|
| 13 |
+
prev_char, next_char = match.groups()
|
| 14 |
+
|
| 15 |
+
# 句子结束标志
|
| 16 |
+
sentence_endings = ".!?"
|
| 17 |
+
|
| 18 |
+
# 设定一个最小段落长度阈值
|
| 19 |
+
min_paragraph_length = 140
|
| 20 |
+
|
| 21 |
+
if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
|
| 22 |
+
return "\n\n"
|
| 23 |
+
else:
|
| 24 |
+
return " "
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def normalize_text(text):
|
| 28 |
+
"""
|
| 29 |
+
通过把连字(ligatures)等文本特殊符号转换为其基本形式来对文本进行归一化处理。
|
| 30 |
+
例如,将连字 "fi" 转换为 "f" 和 "i"。
|
| 31 |
+
"""
|
| 32 |
+
# 对文本进行归一化处理,分解连字
|
| 33 |
+
normalized_text = unicodedata.normalize("NFKD", text)
|
| 34 |
+
|
| 35 |
+
# 替换其他特殊字符
|
| 36 |
+
cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
|
| 37 |
+
|
| 38 |
+
return cleaned_text
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def clean_text(raw_text):
|
| 42 |
+
"""
|
| 43 |
+
对从 PDF 提取出的原始文本进行清洗和格式化处理。
|
| 44 |
+
1. 对原始文本进行归一化处理。
|
| 45 |
+
2. 替换跨行的连词,例如 “Espe-\ncially” 转换为 “Especially”。
|
| 46 |
+
3. 根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换。
|
| 47 |
+
"""
|
| 48 |
+
# 对文本进行归一化处理
|
| 49 |
+
normalized_text = normalize_text(raw_text)
|
| 50 |
+
|
| 51 |
+
# 替换跨行的连词
|
| 52 |
+
text = re.sub(r'(\w+-\n\w+)',
|
| 53 |
+
lambda m: m.group(1).replace('-\n', ''), normalized_text)
|
| 54 |
+
|
| 55 |
+
# 根据前后相邻字符的特点,找到原文本中的换行符
|
| 56 |
+
newlines = re.compile(r'(\S)\n(\S)')
|
| 57 |
+
|
| 58 |
+
# 根据 heuristic 规则,用空格或段落分隔符替换原换行符
|
| 59 |
+
final_text = re.sub(newlines, lambda m: m.group(
|
| 60 |
+
1) + is_paragraph_break(m) + m.group(2), text)
|
| 61 |
+
|
| 62 |
+
return final_text.strip()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def read_and_clean_pdf_text(fp):
|
| 66 |
+
import fitz, re
|
| 67 |
+
import numpy as np
|
| 68 |
+
# file_content = ""
|
| 69 |
+
with fitz.open(fp) as doc:
|
| 70 |
+
meta_txt = []
|
| 71 |
+
meta_font = []
|
| 72 |
+
for page in doc:
|
| 73 |
+
# file_content += page.get_text()
|
| 74 |
+
text_areas = page.get_text("dict") # 获取页面上的文本信息
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# # 行元提取 for each word segment with in line for each line for each block
|
| 78 |
+
# meta_txt.extend( [ ["".join( [wtf['text'] for wtf in l['spans'] ]) for l in t['lines'] ] for t in text_areas['blocks'] if 'lines' in t])
|
| 79 |
+
# meta_font.extend([ [ np.mean([wtf['size'] for wtf in l['spans'] ]) for l in t['lines'] ] for t in text_areas['blocks'] if 'lines' in t])
|
| 80 |
+
|
| 81 |
+
# 块元提取 for each word segment with in line for each line for each block
|
| 82 |
+
meta_txt.extend( [ " ".join(["".join( [wtf['text'] for wtf in l['spans'] ]) for l in t['lines'] ]) for t in text_areas['blocks'] if 'lines' in t])
|
| 83 |
+
meta_font.extend([ np.mean( [ np.mean([wtf['size'] for wtf in l['spans'] ]) for l in t['lines'] ]) for t in text_areas['blocks'] if 'lines' in t])
|
| 84 |
+
|
| 85 |
+
def 把字符太少的块清除为回车(meta_txt):
|
| 86 |
+
for index, block_txt in enumerate(meta_txt):
|
| 87 |
+
if len(block_txt) < 100:
|
| 88 |
+
meta_txt[index] = '\n'
|
| 89 |
+
return meta_txt
|
| 90 |
+
meta_txt = 把字符太少的块清除为回车(meta_txt)
|
| 91 |
+
|
| 92 |
+
def 清理多余的空行(meta_txt):
|
| 93 |
+
for index in reversed(range(1, len(meta_txt))):
|
| 94 |
+
if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
|
| 95 |
+
meta_txt.pop(index)
|
| 96 |
+
return meta_txt
|
| 97 |
+
meta_txt = 清理多余的空行(meta_txt)
|
| 98 |
+
|
| 99 |
+
def 合并小写开头的段落块(meta_txt):
|
| 100 |
+
def starts_with_lowercase_word(s):
|
| 101 |
+
pattern = r"^[a-z]+"
|
| 102 |
+
match = re.match(pattern, s)
|
| 103 |
+
if match:
|
| 104 |
+
return True
|
| 105 |
+
else:
|
| 106 |
+
return False
|
| 107 |
+
for _ in range(100):
|
| 108 |
+
for index, block_txt in enumerate(meta_txt):
|
| 109 |
+
if starts_with_lowercase_word(block_txt):
|
| 110 |
+
if meta_txt[index-1]!='\n': meta_txt[index-1] += ' '
|
| 111 |
+
else: meta_txt[index-1] = ''
|
| 112 |
+
meta_txt[index-1] += meta_txt[index]
|
| 113 |
+
meta_txt[index] = '\n'
|
| 114 |
+
return meta_txt
|
| 115 |
+
meta_txt = 合并小写开头的段落块(meta_txt)
|
| 116 |
+
meta_txt = 清理多余的空行(meta_txt)
|
| 117 |
+
|
| 118 |
+
meta_txt = '\n'.join(meta_txt)
|
| 119 |
+
# 清除重复的换行
|
| 120 |
+
for _ in range(5):
|
| 121 |
+
meta_txt = meta_txt.replace('\n\n','\n')
|
| 122 |
+
|
| 123 |
+
# 换行 -> 双换行
|
| 124 |
+
meta_txt = meta_txt.replace('\n', '\n\n')
|
| 125 |
+
|
| 126 |
+
# print(meta_txt)
|
| 127 |
+
|
| 128 |
+
return meta_txt
|
| 129 |
+
|
| 130 |
+
@CatchException
|
| 131 |
+
def 批量翻译PDF文档(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
|
| 132 |
+
import glob
|
| 133 |
+
import os
|
| 134 |
+
|
| 135 |
+
# 基本信息:功能、贡献者
|
| 136 |
+
chatbot.append([
|
| 137 |
+
"函数插件功能?",
|
| 138 |
+
"批量总结PDF文档。函数插件贡献者: Binary-Husky, ValeriaWong, Eralien"])
|
| 139 |
+
yield chatbot, history, '正常'
|
| 140 |
+
|
| 141 |
+
# 尝试导入依赖,如果缺少依赖,则给出安装建议
|
| 142 |
+
try:
|
| 143 |
+
import fitz, tiktoken
|
| 144 |
+
except:
|
| 145 |
+
report_execption(chatbot, history,
|
| 146 |
+
a=f"解析项目: {txt}",
|
| 147 |
+
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
|
| 148 |
+
yield chatbot, history, '正常'
|
| 149 |
+
return
|
| 150 |
+
|
| 151 |
+
# 清空历史,以免输入溢出
|
| 152 |
+
history = []
|
| 153 |
+
|
| 154 |
+
# 检测输入参数,如没有给定输入参数,直接退出
|
| 155 |
+
if os.path.exists(txt):
|
| 156 |
+
project_folder = txt
|
| 157 |
+
else:
|
| 158 |
+
if txt == "":
|
| 159 |
+
txt = '空空如也的输入栏'
|
| 160 |
+
report_execption(chatbot, history,
|
| 161 |
+
a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
|
| 162 |
+
yield chatbot, history, '正常'
|
| 163 |
+
return
|
| 164 |
+
|
| 165 |
+
# 搜索需要处理的文件清单
|
| 166 |
+
file_manifest = [f for f in glob.glob(
|
| 167 |
+
f'{project_folder}/**/*.pdf', recursive=True)]
|
| 168 |
+
|
| 169 |
+
# 如果没找到任何文件
|
| 170 |
+
if len(file_manifest) == 0:
|
| 171 |
+
report_execption(chatbot, history,
|
| 172 |
+
a=f"解析项目: {txt}", b=f"找不到任何.tex或.pdf文件: {txt}")
|
| 173 |
+
yield chatbot, history, '正常'
|
| 174 |
+
return
|
| 175 |
+
|
| 176 |
+
# 开始正式执行任务
|
| 177 |
+
yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
|
| 181 |
+
import time
|
| 182 |
+
import glob
|
| 183 |
+
import os
|
| 184 |
+
import fitz
|
| 185 |
+
import tiktoken
|
| 186 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 187 |
+
print('begin analysis on:', file_manifest)
|
| 188 |
+
for index, fp in enumerate(file_manifest):
|
| 189 |
+
### 1. 读取PDF文件
|
| 190 |
+
file_content = read_and_clean_pdf_text(fp)
|
| 191 |
+
### 2. 递归地切割PDF文件
|
| 192 |
+
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
|
| 193 |
+
enc = tiktoken.get_encoding("gpt2")
|
| 194 |
+
TOKEN_LIMIT_PER_FRAGMENT = 2048
|
| 195 |
+
get_token_num = lambda txt: len(enc.encode(txt))
|
| 196 |
+
# 分解
|
| 197 |
+
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
|
| 198 |
+
txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
|
| 199 |
+
print([get_token_num(frag) for frag in paper_fragments])
|
| 200 |
+
### 3. 逐个段落翻译
|
| 201 |
+
## 3.1. 多线程开始
|
| 202 |
+
from request_llm.bridge_chatgpt import predict_no_ui_long_connection
|
| 203 |
+
n_frag = len(paper_fragments)
|
| 204 |
+
# 异步原子
|
| 205 |
+
mutable = [["", time.time()] for _ in range(n_frag)]
|
| 206 |
+
# 翻译函数
|
| 207 |
+
def translate_(index, fragment, mutable):
|
| 208 |
+
i_say = f"以下是你需要翻译的文章段落:{fragment}"
|
| 209 |
+
# 请求gpt,需要一段时间
|
| 210 |
+
gpt_say = predict_no_ui_long_connection(
|
| 211 |
+
inputs=i_say, top_p=top_p, temperature=temperature, history=[], # ["请翻译:" if len(previous_result)!=0 else "", previous_result],
|
| 212 |
+
sys_prompt="请你作为一个学术翻译,负责将给定的文章段落翻译成中文,要求语言简洁、精准、凝练。你只需要给出翻译后的文本,不能重复原文。",
|
| 213 |
+
observe_window=mutable[index])
|
| 214 |
+
return gpt_say
|
| 215 |
+
### 4. 异步任务开始
|
| 216 |
+
executor = ThreadPoolExecutor(max_workers=16)
|
| 217 |
+
# Submit tasks to the pool
|
| 218 |
+
futures = [executor.submit(translate_, index, frag, mutable) for index, frag in enumerate(paper_fragments)]
|
| 219 |
+
|
| 220 |
+
### 5. UI主线程,在任务期间提供实时的前端显示
|
| 221 |
+
cnt = 0
|
| 222 |
+
while True:
|
| 223 |
+
cnt += 1
|
| 224 |
+
time.sleep(1)
|
| 225 |
+
worker_done = [h.done() for h in futures]
|
| 226 |
+
if all(worker_done):
|
| 227 |
+
executor.shutdown(); break
|
| 228 |
+
# 更好的UI视觉效果
|
| 229 |
+
observe_win = []
|
| 230 |
+
# 每个线程都要喂狗(看门狗)
|
| 231 |
+
for thread_index, _ in enumerate(worker_done):
|
| 232 |
+
mutable[thread_index][1] = time.time()
|
| 233 |
+
# 在前端打印些好玩的东西
|
| 234 |
+
for thread_index, _ in enumerate(worker_done):
|
| 235 |
+
print_something_really_funny = "[ ...`"+mutable[thread_index][0][-30:].replace('\n','').replace('```','...').replace(' ','.').replace('<br/>','.....').replace('$','.')+"`... ]"
|
| 236 |
+
observe_win.append(print_something_really_funny)
|
| 237 |
+
stat_str = ''.join([f'执行中: {obs}\n\n' if not done else '已完成\n\n' for done, obs in zip(worker_done, observe_win)])
|
| 238 |
+
chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt%10+1))]; msg = "正常"
|
| 239 |
+
yield chatbot, history, msg
|
| 240 |
+
|
| 241 |
+
# Wait for tasks to complete
|
| 242 |
+
results = [future.result() for future in futures]
|
| 243 |
+
|
| 244 |
+
print(results)
|
| 245 |
+
# full_result += gpt_say
|
| 246 |
+
|
| 247 |
+
# history.extend([fp, full_result])
|
| 248 |
+
|
| 249 |
+
res = write_results_to_file(history)
|
| 250 |
+
chatbot.append(("完成了吗?", res)); msg = "完成"
|
| 251 |
+
yield chatbot, history, msg
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# if __name__ == '__main__':
|
| 255 |
+
# pro()
|
crazy_functions/高级功能函数模板.py
CHANGED
|
@@ -14,12 +14,13 @@ def 高阶功能模板函数(txt, top_p, temperature, chatbot, history, systemPr
|
|
| 14 |
i_say = f'历史中哪些事件发生在{currentMonth}月{currentDay}日?列举两条并发送相关图片。发送图片时,请使用Markdown,将Unsplash API中的PUT_YOUR_QUERY_HERE替换成描述该事件的一个最重要的单词。'
|
| 15 |
chatbot.append((i_say, "[Local Message] waiting gpt response."))
|
| 16 |
yield chatbot, history, '正常' # 由于请求gpt需要一段时间,我们先及时地做一次状态显示
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
chatbot[-1] = (i_say, gpt_say)
|
| 24 |
history.append(i_say);history.append(gpt_say)
|
| 25 |
yield chatbot, history, '正常' # 显示
|
|
|
|
| 14 |
i_say = f'历史中哪些事件发生在{currentMonth}月{currentDay}日?列举两条并发送相关图片。发送图片时,请使用Markdown,将Unsplash API中的PUT_YOUR_QUERY_HERE替换成描述该事件的一个最重要的单词。'
|
| 15 |
chatbot.append((i_say, "[Local Message] waiting gpt response."))
|
| 16 |
yield chatbot, history, '正常' # 由于请求gpt需要一段时间,我们先及时地做一次状态显示
|
| 17 |
+
try:
|
| 18 |
+
# history = [] 每次询问不携带之前的询问历史
|
| 19 |
+
gpt_say = predict_no_ui_long_connection(
|
| 20 |
+
inputs=i_say, top_p=top_p, temperature=temperature, history=[],
|
| 21 |
+
sys_prompt="当你想发送一张照片时,请使用Markdown, 并且不要有反斜线, 不要用代码块。使用 Unsplash API (https://source.unsplash.com/1280x720/? < PUT_YOUR_QUERY_HERE >)。") # 请求gpt,需要一段时间
|
| 22 |
+
except:
|
| 23 |
+
print("")
|
| 24 |
chatbot[-1] = (i_say, gpt_say)
|
| 25 |
history.append(i_say);history.append(gpt_say)
|
| 26 |
yield chatbot, history, '正常' # 显示
|
main.py
CHANGED
|
@@ -23,12 +23,12 @@ except:logging.basicConfig(filename="gpt_log/chat_secrets.log", level=logging.IN
|
|
| 23 |
print("所有问询记录将自动保存在本地目录./gpt_log/chat_secrets.log, 请注意自我隐私保护哦!")
|
| 24 |
|
| 25 |
# 一些普通功能模块
|
| 26 |
-
from
|
| 27 |
-
functional =
|
| 28 |
|
| 29 |
# 高级函数插件
|
| 30 |
-
from
|
| 31 |
-
crazy_fns =
|
| 32 |
|
| 33 |
# 处理markdown文本格式的转变
|
| 34 |
gr.Chatbot.postprocess = format_io
|
|
|
|
| 23 |
print("所有问询记录将自动保存在本地目录./gpt_log/chat_secrets.log, 请注意自我隐私保护哦!")
|
| 24 |
|
| 25 |
# 一些普通功能模块
|
| 26 |
+
from core_functional import get_core_functions
|
| 27 |
+
functional = get_core_functions()
|
| 28 |
|
| 29 |
# 高级函数插件
|
| 30 |
+
from crazy_functional import get_crazy_functions
|
| 31 |
+
crazy_fns = get_crazy_functions()
|
| 32 |
|
| 33 |
# 处理markdown文本格式的转变
|
| 34 |
gr.Chatbot.postprocess = format_io
|
request_llm/bridge_chatgpt.py
CHANGED
|
@@ -12,6 +12,7 @@
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
import json
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
import logging
|
| 17 |
import traceback
|
|
@@ -73,11 +74,20 @@ def predict_no_ui(inputs, top_p, temperature, history=[], sys_prompt=""):
|
|
| 73 |
|
| 74 |
def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_prompt="", observe_window=None):
|
| 75 |
"""
|
| 76 |
-
发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
"""
|
|
|
|
| 79 |
headers, payload = generate_payload(inputs, top_p, temperature, history, system_prompt=sys_prompt, stream=True)
|
| 80 |
-
|
| 81 |
retry = 0
|
| 82 |
while True:
|
| 83 |
try:
|
|
@@ -109,10 +119,16 @@ def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_pr
|
|
| 109 |
if "content" in delta:
|
| 110 |
result += delta["content"]
|
| 111 |
print(delta["content"], end='')
|
| 112 |
-
if observe_window is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
else: raise RuntimeError("意外Json结构:"+delta)
|
| 114 |
if json_data['finish_reason'] == 'length':
|
| 115 |
-
raise ConnectionAbortedError("正常结束,但显示Token
|
| 116 |
return result
|
| 117 |
|
| 118 |
|
|
@@ -128,11 +144,11 @@ def predict(inputs, top_p, temperature, chatbot=[], history=[], system_prompt=''
|
|
| 128 |
additional_fn代表点击的哪个按钮,按钮见functional.py
|
| 129 |
"""
|
| 130 |
if additional_fn is not None:
|
| 131 |
-
import
|
| 132 |
-
importlib.reload(
|
| 133 |
-
|
| 134 |
-
if "PreProcess" in
|
| 135 |
-
inputs =
|
| 136 |
|
| 137 |
if stream:
|
| 138 |
raw_input = inputs
|
|
@@ -189,10 +205,10 @@ def predict(inputs, top_p, temperature, chatbot=[], history=[], system_prompt=''
|
|
| 189 |
chunk = get_full_error(chunk, stream_response)
|
| 190 |
error_msg = chunk.decode()
|
| 191 |
if "reduce the length" in error_msg:
|
| 192 |
-
chatbot[-1] = (chatbot[-1][0], "[Local Message]
|
| 193 |
history = [] # 清除历史
|
| 194 |
elif "Incorrect API key" in error_msg:
|
| 195 |
-
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key
|
| 196 |
elif "exceeded your current quota" in error_msg:
|
| 197 |
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由,拒绝服务.")
|
| 198 |
else:
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
import json
|
| 15 |
+
import time
|
| 16 |
import gradio as gr
|
| 17 |
import logging
|
| 18 |
import traceback
|
|
|
|
| 74 |
|
| 75 |
def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_prompt="", observe_window=None):
|
| 76 |
"""
|
| 77 |
+
发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免中途网线被掐。
|
| 78 |
+
inputs:
|
| 79 |
+
是本次问询的输入
|
| 80 |
+
sys_prompt:
|
| 81 |
+
系统静默prompt
|
| 82 |
+
top_p, temperature:
|
| 83 |
+
chatGPT的内部调优参数
|
| 84 |
+
history:
|
| 85 |
+
是之前的对话列表
|
| 86 |
+
observe_window = None:
|
| 87 |
+
用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可。observe_window[0]:观测窗。observe_window[1]:看门狗
|
| 88 |
"""
|
| 89 |
+
watch_dog_patience = 5 # 看门狗的耐心, 设置5秒即可
|
| 90 |
headers, payload = generate_payload(inputs, top_p, temperature, history, system_prompt=sys_prompt, stream=True)
|
|
|
|
| 91 |
retry = 0
|
| 92 |
while True:
|
| 93 |
try:
|
|
|
|
| 119 |
if "content" in delta:
|
| 120 |
result += delta["content"]
|
| 121 |
print(delta["content"], end='')
|
| 122 |
+
if observe_window is not None:
|
| 123 |
+
# 观测窗,把已经获取的数据显示出去
|
| 124 |
+
if len(observe_window) >= 1: observe_window[0] += delta["content"]
|
| 125 |
+
# 看门狗,如果超过期限没有喂狗,则终止
|
| 126 |
+
if len(observe_window) >= 2:
|
| 127 |
+
if (time.time()-observe_window[1]) > watch_dog_patience:
|
| 128 |
+
raise RuntimeError("程序终止。")
|
| 129 |
else: raise RuntimeError("意外Json结构:"+delta)
|
| 130 |
if json_data['finish_reason'] == 'length':
|
| 131 |
+
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
| 132 |
return result
|
| 133 |
|
| 134 |
|
|
|
|
| 144 |
additional_fn代表点击的哪个按钮,按钮见functional.py
|
| 145 |
"""
|
| 146 |
if additional_fn is not None:
|
| 147 |
+
import core_functional
|
| 148 |
+
importlib.reload(core_functional) # 热更新prompt
|
| 149 |
+
core_functional = core_functional.get_functions()
|
| 150 |
+
if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话)
|
| 151 |
+
inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
|
| 152 |
|
| 153 |
if stream:
|
| 154 |
raw_input = inputs
|
|
|
|
| 205 |
chunk = get_full_error(chunk, stream_response)
|
| 206 |
error_msg = chunk.decode()
|
| 207 |
if "reduce the length" in error_msg:
|
| 208 |
+
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长,或历史数据过长. 历史缓存数据现已释放,您可以请再次尝试.")
|
| 209 |
history = [] # 清除历史
|
| 210 |
elif "Incorrect API key" in error_msg:
|
| 211 |
+
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由,拒绝服务.")
|
| 212 |
elif "exceeded your current quota" in error_msg:
|
| 213 |
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由,拒绝服务.")
|
| 214 |
else:
|
request_llm/bridge_tgui.py
CHANGED
|
@@ -101,11 +101,11 @@ def predict_tgui(inputs, top_p, temperature, chatbot=[], history=[], system_prom
|
|
| 101 |
additional_fn代表点击的哪个按钮,按钮见functional.py
|
| 102 |
"""
|
| 103 |
if additional_fn is not None:
|
| 104 |
-
import
|
| 105 |
-
importlib.reload(
|
| 106 |
-
|
| 107 |
-
if "PreProcess" in
|
| 108 |
-
inputs =
|
| 109 |
|
| 110 |
raw_input = "What I would like to say is the following: " + inputs
|
| 111 |
logging.info(f'[raw_input] {raw_input}')
|
|
|
|
| 101 |
additional_fn代表点击的哪个按钮,按钮见functional.py
|
| 102 |
"""
|
| 103 |
if additional_fn is not None:
|
| 104 |
+
import core_functional
|
| 105 |
+
importlib.reload(core_functional) # 热更新prompt
|
| 106 |
+
core_functional = core_functional.get_functions()
|
| 107 |
+
if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话)
|
| 108 |
+
inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
|
| 109 |
|
| 110 |
raw_input = "What I would like to say is the following: " + inputs
|
| 111 |
logging.info(f'[raw_input] {raw_input}')
|
requirements.txt
CHANGED
|
@@ -5,3 +5,4 @@ Markdown
|
|
| 5 |
latex2mathml
|
| 6 |
openai
|
| 7 |
transformers
|
|
|
|
|
|
| 5 |
latex2mathml
|
| 6 |
openai
|
| 7 |
transformers
|
| 8 |
+
numpy
|