Commit: 改进效率 (improve efficiency)
crazy_functions/代码重写为全英文_多线程.py  CHANGED  (+11, -14)
@@ -10,16 +10,13 @@ def extract_code_block_carefully(txt):
     txt_out = '```'.join(splitted[1:-1])
     return txt_out
 
-def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True):
-    from transformers import GPT2TokenizerFast
-    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-    get_token_cnt = lambda txt: len(tokenizer(txt)["input_ids"])
+def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit, must_break_at_empty_line=True):
     def cut(txt_tocut, must_break_at_empty_line): # 递归
-        if get_token_cnt(txt_tocut) <= limit:
+        if get_token_fn(txt_tocut) <= limit:
             return [txt_tocut]
         else:
             lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_cnt(txt_tocut) * len(lines)
+            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
             estimated_line_cut = int(estimated_line_cut)
             for cnt in reversed(range(estimated_line_cut)):
                 if must_break_at_empty_line:
@@ -27,7 +24,7 @@ def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True):
                 print(cnt)
                 prev = "\n".join(lines[:cnt])
                 post = "\n".join(lines[cnt:])
-                if get_token_cnt(prev) < limit: break
+                if get_token_fn(prev) < limit: break
             if cnt == 0:
                 print('what the f?')
                 raise RuntimeError("存在一行极长的文本!")
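These two hunks are the core of the change: breakdown_txt_to_satisfy_token_limit no longer imports transformers and builds a GPT-2 tokenizer on every call; it now receives a token-counting callable get_token_fn as a parameter. The hunks show only part of the recursive cut() helper, so the sketch below fills in the unshown pieces (the blank-line check, the recursion on the remaining text, and the failure path) as assumptions, and uses a plain word counter in place of the tokenizer so it runs without transformers:

    def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit, must_break_at_empty_line=True):
        # Recursive splitter: cut the text at (preferably blank) line boundaries
        # until every chunk fits under `limit` tokens.
        def cut(txt_tocut, must_break_at_empty_line):
            if get_token_fn(txt_tocut) <= limit:
                return [txt_tocut]
            lines = txt_tocut.split('\n')
            # First guess for the cut point, proportional to the token budget.
            estimated_line_cut = int(limit / get_token_fn(txt_tocut) * len(lines))
            cnt = 0
            for cnt in reversed(range(estimated_line_cut)):
                if must_break_at_empty_line and lines[cnt] != "":
                    continue  # only cut on a blank line when requested
                prev = "\n".join(lines[:cnt])
                post = "\n".join(lines[cnt:])
                if get_token_fn(prev) < limit:
                    break
            if cnt == 0:
                # No workable cut point found (mirrors the original RuntimeError).
                raise RuntimeError("a single segment is too long to split")
            return [prev] + cut(post, must_break_at_empty_line)
        return cut(txt, must_break_at_empty_line)

    get_token_fn = lambda s: len(s.split())  # stand-in token counter, not the GPT-2 tokenizer
    parts = breakdown_txt_to_satisfy_token_limit("a b c\n\nd e f\n\ng h i", get_token_fn, limit=5)
    print(parts)  # every chunk stays under the 5-word budget

Injecting the counter also makes the splitter reusable with any tokenizer, which is why the later hunks only have to build it once.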
@@ -86,12 +83,12 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
 
 
     # 第5步:Token限制下的截断与处理
-    MAX_TOKEN = 
-
-
-
-
-
+    MAX_TOKEN = 3000
+    from transformers import GPT2TokenizerFast
+    print('加载tokenizer中')
+    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    get_token_fn = lambda txt: len(tokenizer(txt)["input_ids"])
+    print('加载tokenizer结束')
 
 
     # 第6步:任务函数
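The hunk above (step 5, token-limited truncation) is where the efficiency gain lands: GPT2TokenizerFast.from_pretrained("gpt2") is expensive, so it now runs once in the main function and only the cheap get_token_fn lambda travels into the splitter. A minimal sketch of that pattern, assuming transformers is installed; the sample contents are hypothetical:

    from transformers import GPT2TokenizerFast

    MAX_TOKEN = 3000
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")        # built a single time
    get_token_fn = lambda txt: len(tokenizer(txt)["input_ids"])  # cheap to call per chunk

    for file_content in ["print('hello')", "def f():\n    return 1"]:  # hypothetical file contents
        print(get_token_fn(file_content))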
@@ -107,7 +104,7 @@ def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt,
         try:
             gpt_say = ""
             # 分解代码文件
-            file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, MAX_TOKEN)
+            file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, get_token_fn, MAX_TOKEN)
             for file_content_partial in file_content_breakdown:
                 i_say = i_say_template(fp, file_content_partial)
                 # # ** gpt request **
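At the call site (step 6, the per-file task function) the only change is passing get_token_fn alongside MAX_TOKEN. A usage sketch, reusing MAX_TOKEN, get_token_fn and breakdown_txt_to_satisfy_token_limit as defined above; fp and i_say_template appear in the diff but their definitions are not shown, so the versions here are hypothetical stand-ins:

    fp = "demo.py"  # hypothetical file path

    def i_say_template(fp, chunk):
        # hypothetical stand-in for the real prompt template defined elsewhere in the file
        return f"Rewrite the comments of {fp} in English:\n{chunk}"

    file_content = "# comment line\nprint('hi')\n\n" * 400  # pretend file, large enough to need splitting
    file_content_breakdown = breakdown_txt_to_satisfy_token_limit(file_content, get_token_fn, MAX_TOKEN)
    for file_content_partial in file_content_breakdown:
        i_say = i_say_template(fp, file_content_partial)
        # ** gpt request **: each chunk-sized prompt would be sent to the model here
        print(get_token_fn(i_say))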