Spaces:
Running
Running
| import markdown | |
| import re | |
| import os | |
| import math | |
| from textwrap import dedent | |
| from functools import lru_cache | |
| from pymdownx.superfences import fence_code_format | |
| from latex2mathml.converter import convert as tex2mathml | |
| from shared_utils.config_loader import get_conf as get_conf | |
| from shared_utils.text_mask import apply_gpt_academic_string_mask | |
# Options handed to the "mdx_math" Markdown extension:
# dollar-sign delimiters are enabled, GitLab-style delimiters are not.
markdown_extension_configs = {
    "mdx_math": dict(
        enable_dollar_delimiter=True,
        use_gitlab_delimiters=False,
    ),
}
# Options for fenced code blocks and syntax highlighting.
# "mermaid" fences are registered as a custom fence rendered through
# fence_code_format, with the CSS class "mermaid".
code_highlight_configs = {
    "pymdownx.superfences": dict(
        css_class="codehilite",
        custom_fences=[
            {"name": "mermaid", "class": "mermaid", "format": fence_code_format},
        ],
    ),
    # The optional "auto_title" and "linenums" settings are currently left unset.
    "pymdownx.highlight": dict(
        css_class="codehilite",
        guess_lang=True,
    ),
}
# Same highlighting options as code_highlight_configs, except that no custom
# "mermaid" fence is registered for "pymdownx.superfences".
code_highlight_configs_block_mermaid = {
    "pymdownx.superfences": dict(
        css_class="codehilite",
    ),
    # The optional "auto_title" and "linenums" settings are currently left unset.
    "pymdownx.highlight": dict(
        css_class="codehilite",
        guess_lang=True,
    ),
}
def tex2mathml_catch_exception(content, *args, **kwargs):
    """
    Convert a TeX string to MathML, falling back to the input on failure.

    Args:
        content: The TeX source to convert.
        *args, **kwargs: Forwarded unchanged to ``tex2mathml``.

    Returns:
        The value returned by ``tex2mathml``, or ``content`` itself when the
        conversion raises an exception.
    """
    try:
        return tex2mathml(content, *args, **kwargs)
    except Exception:
        # Best effort: keep the raw TeX when conversion fails. Only
        # ``Exception`` is caught so that KeyboardInterrupt / SystemExit
        # still propagate instead of being silently swallowed.
        return content
def replace_math_no_render(match):
    """
    Turn a matched math element into colourised, unrendered TeX source.

    Args:
        match: A regex match whose group 1 is the TeX content and whose full
            text contains "mode=display" for display-mode math.

    Returns:
        An HTML string with the TeX wrapped in ``$$`` (display mode, newlines
        replaced by ``</br>``) or ``$`` (inline mode) inside <font> tags.
    """
    tex_source = match.group(1)
    is_display = "mode=display" in match.group(0)
    if is_display:
        tex_source = tex_source.replace("\n", "</br>")
    delimiter = "$$" if is_display else "$"
    return (
        f'<font color="#00FF00">{delimiter}</font>'
        f'<font color="#FF00FF">{tex_source}</font>'
        f'<font color="#00FF00">{delimiter}</font>'
    )
def replace_math_render(match):
    """
    Turn a matched math element into rendered MathML.

    Args:
        match: A regex match whose group 1 is the TeX content and whose full
            text contains "mode=display" for display-mode math.

    Returns:
        The result of ``tex2mathml_catch_exception`` for the TeX content
        (called with ``display="block"`` in display mode).
    """
    tex_source = match.group(1)
    if "mode=display" not in match.group(0):
        # Inline math: convert as-is.
        return tex2mathml_catch_exception(tex_source)

    if "\\begin{aligned}" in tex_source:
        # Rewrite the "aligned" environment as "array" and drop the
        # alignment markers before conversion.
        rewrites = (
            ("\\begin{aligned}", "\\begin{array}"),
            ("\\end{aligned}", "\\end{array}"),
            ("&", " "),
        )
        for old, new in rewrites:
            tex_source = tex_source.replace(old, new)
    return tex2mathml_catch_exception(tex_source, display="block")
def markdown_bug_hunt(content):
    """
    Work around an mdx_math bug: when a single ``$`` wraps a ``\\begin``
    command, a redundant outer <script> element is emitted.

    The doubled opening tag is collapsed into the display-mode one, and the
    doubled closing tag into a single ``</script>``.
    """
    doubled_open = '<script type="math/tex">\n<script type="math/tex; mode=display">'
    single_open = '<script type="math/tex; mode=display">'
    doubled_close = "</script>\n</script>"
    single_close = "</script>"
    return content.replace(doubled_open, single_open).replace(doubled_close, single_close)
def is_equation(txt):
    """
    Decide whether the text contains math formulas.

    Manual test prompts: (1) state the Lorentz law using TeX formulas,
    (2) give the Cauchy inequality in LaTeX, (3) write out Maxwell's equations.

    Returns:
        True when at least one ``$...$``, ``$$...$$`` or ``\\[...\\]`` span is
        found and none of the spans contains a non-ASCII character or the
        word "echo"; False otherwise. Text containing a code fence is
        rejected unless it contains a "```reference" fence.
    """
    if "```" in txt and "```reference" not in txt:
        return False
    if not ("$" in txt or "\\[" in txt):
        return False

    single_line = re.ASCII
    multi_line = re.ASCII | re.DOTALL
    # (pattern, flags) pairs; each pattern captures (open, body, close).
    # Patterns for \(...\), \begin...\end and $`...`$ are currently disabled.
    delimiter_patterns = (
        (r"(?<!\\|\$)(\$)([^\$]+)(\$)", single_line),  # $...$
        (r"(?<!\\)(\$\$)([^\$]+)(\$\$)", multi_line),  # $$...$$
        (r"(?<!\\)(\\\[)(.+?)(\\\])", single_line),  # \[...\]
    )
    candidates = [
        groups
        for regex, flags in delimiter_patterns
        for groups in re.findall(regex, txt, flags)
    ]
    if not candidates:
        return False

    forbidden = re.compile(r"[^\x00-\x7F]|echo")
    for groups in candidates:
        if len(groups) != 3:
            return False
        if forbidden.search(groups[1]):
            return False
    return True
def fix_markdown_indent(txt):
    """
    Normalise non-standard indentation of nested Markdown bullets.

    Once a line starting with "- " or "1. " has been seen, any later line
    made of leading whitespace followed by "-" whose indent width is 3 modulo 4
    is re-indented with spaces up to the next multiple of 4.
    """
    # Fast path: nothing to fix unless both markers appear in the text.
    if not (" - " in txt and ". " in txt):
        return txt

    indented_bullet = re.compile(r"^\s+-")
    inside_list = False
    fixed = []
    for raw in txt.split("\n"):
        if raw.startswith(("- ", "1. ")):
            inside_list = True
        if inside_list and indented_bullet.match(raw):
            body = raw.lstrip()
            indent = len(raw) - len(body)
            if indent % 4 == 3:
                raw = " " * (math.ceil(indent / 4) * 4) + body
        fixed.append(raw)
    return "\n".join(fixed)
# Matches one fenced code block (``` or ~~~ fences, optionally indented),
# capturing the fence, optional {attrs} / language / hl_lines, and the code.
# The pattern is written in verbose mode, so layout whitespace is ignored.
FENCED_BLOCK_RE = re.compile(
    dedent(
        r"""
        (?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]*                      # opening fence
        ((\{(?P<attrs>[^\}\n]*)\})|                              # (optional {attrs} or
        (\.?(?P<lang>[\w#.+-]*)[ ]*)?                            # optional (.)lang
        (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
        \n                                                       # newline (end of opening fence)
        (?P<code>.*?)(?<=\n)                                     # the code block
        (?P=fence)[ ]*$                                          # closing fence
        """
    ),
    flags=re.VERBOSE | re.DOTALL | re.MULTILINE,
)
def get_line_range(re_match_obj, txt):
    """
    Map a regex match inside ``txt`` to a half-open range of line indices.

    Args:
        re_match_obj: A match object obtained by searching a string with the
            same length and offsets as ``txt``.
        txt: The text the match offsets refer to.

    Returns:
        A ``(line_start, line_end)`` tuple of 0-based indices into
        ``txt.split("\\n")``, suitable for ``range(line_start, line_end)``.
    """
    # Use the documented Match.span() rather than the undocumented ``regs``.
    start_pos, end_pos = re_match_obj.span()
    # Newlines up to and including the first matched character give the
    # index of the line on which the match starts.
    line_start = txt[: start_pos + 1].count("\n")
    line_end = line_start + txt[start_pos:end_pos].count("\n") + 1
    return line_start, line_end
def fix_code_segment_indent(txt):
    """
    Re-indent fenced code blocks whose shared indentation is 3 modulo 4.

    Every block matched by ``FENCED_BLOCK_RE`` is inspected. The shared indent
    is the smallest leading-whitespace width among the block's non-blank
    lines; blank or whitespace-only lines are ignored so that an empty line
    inside the block does not force the shared indent to 0. When the shared
    indent is 3 modulo 4, every non-blank line of the block is shifted right
    to the next multiple of 4.

    Args:
        txt: The Markdown text to inspect.

    Returns:
        The text with affected blocks re-indented, or ``txt`` itself when
        nothing needed to change.
    """
    lines = None  # split lazily, only once a fenced block is found
    changed = False
    scratch = txt
    while True:
        fence_match = FENCED_BLOCK_RE.search(scratch)
        if fence_match is None:
            break
        if lines is None:
            lines = txt.split("\n")

        # Blank out the matched span with an equal number of spaces so the
        # next search finds the following block while offsets stay aligned
        # with the original text.
        start_pos, end_pos = fence_match.span()
        scratch = scratch[:start_pos] + " " * (end_pos - start_pos) + scratch[end_pos:]
        line_start, line_end = get_line_range(fence_match, txt)

        # Shared indentation over the non-blank lines of the block.
        content_rows = [i for i in range(line_start, line_end) if lines[i].strip()]
        if not content_rows:
            continue
        shared_indent = min(len(lines[i]) - len(lines[i].lstrip()) for i in content_rows)

        # Fix the indentation.
        if shared_indent % 4 == 3:
            target_indent = math.ceil(shared_indent / 4) * 4
            padding = " " * (target_indent - shared_indent)
            for i in content_rows:
                lines[i] = padding + lines[i]
            changed = True

    return "\n".join(lines) if changed else txt
@lru_cache(maxsize=128)  # use an LRU cache to speed up repeated conversions
def markdown_convertion(txt):
    """
    Convert Markdown text to HTML wrapped in a ``markdown-body`` div.

    If the text contains math formulas (as judged by ``is_equation``), the
    result holds two renderings separated by a horizontal rule: first the
    formulas as colourised, easy-to-copy TeX source, then the formulas
    converted to MathML. Otherwise the text is converted once, without the
    math extension.

    Args:
        txt: The Markdown source. Must be hashable (a ``str``) because
            results are memoised.

    Returns:
        The HTML string. Input that is already wrapped in the
        ``markdown-body`` div is returned unchanged.
    """
    pre = '<div class="markdown-body">'
    suf = "</div>"
    if txt.startswith(pre) and txt.endswith(suf):
        # Already converted; converting a second time may cause problems.
        return txt

    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
    txt = fix_markdown_indent(txt)
    # NOTE: fix_code_segment_indent(txt) is currently not applied here.

    if not is_equation(txt):
        # No math: plain Markdown conversion with code highlighting.
        body = markdown.markdown(
            txt,
            extensions=[
                "sane_lists",
                "tables",
                "pymdownx.superfences",
                "pymdownx.highlight",
            ],
            extension_configs=code_highlight_configs,
        )
        return pre + body + suf

    # The text has $-style formula markers and no plain ``` code fences.
    # Convert everything to HTML first.
    split = markdown.markdown(text="---")
    convert_stage_1 = markdown.markdown(
        text=txt,
        extensions=[
            "sane_lists",
            "tables",
            "mdx_math",
            "pymdownx.superfences",
            "pymdownx.highlight",
        ],
        extension_configs={**markdown_extension_configs, **code_highlight_configs},
    )
    convert_stage_1 = markdown_bug_hunt(convert_stage_1)
    # 1. Easy-to-copy TeX (math is not rendered).
    convert_stage_2_1 = re.sub(
        find_equation_pattern,
        replace_math_no_render,
        convert_stage_1,
        flags=re.DOTALL,
    )
    # 2. Rendered equations.
    convert_stage_2_2 = re.sub(
        find_equation_pattern,
        replace_math_render,
        convert_stage_1,
        flags=re.DOTALL,
    )
    # Concatenate both renderings.
    return pre + convert_stage_2_1 + split + convert_stage_2_2 + suf
def close_up_code_segment_during_stream(gpt_reply):
    """
    Close a code fence that is still open while the reply is streaming.

    When the opening ``` has been emitted but the closing ``` has not yet
    arrived, a closing fence is appended.

    Args:
        gpt_reply (str): The reply string returned by the GPT model.

    Returns:
        str: The reply with "\\n```" appended when the number of ``` markers
        is odd; otherwise the reply unchanged.
    """
    fence = "```"
    # No fence at all, or the reply ends right on a fence: nothing to add.
    if fence not in gpt_reply or gpt_reply.endswith(fence):
        return gpt_reply
    fence_count = gpt_reply.count(fence)
    if fence_count % 2 == 0:
        return gpt_reply
    # An odd count means a code segment is still being written.
    return gpt_reply + "\n```"
def special_render_issues_for_mermaid(text):
    """
    Handle, in a not very elegant way, one mermaid rendering special case
    from core_functional.py: the mermaid block in the "总结绘制脑图" prompt
    should not be rendered.

    When ``text`` ends with that prompt's "Suffix", every "```mermaid" fence
    in the text is turned into a plain "```" fence.
    """
    # Imported at call time, inside the function.
    from core_functional import get_core_functions

    mindmap_suffix = get_core_functions()["总结绘制脑图"]["Suffix"]
    if not text.endswith(mindmap_suffix):
        return text
    return text.replace("```mermaid", "```")
def compat_non_markdown_input(text):
    """
    Improve how non-Markdown input is displayed, e.g. by converting spaces
    to ``&nbsp;`` and newlines to ``</br>``.

    Args:
        text: The raw input text.

    Returns:
        - Text containing a code fence is treated as Markdown and only passed
          through ``special_render_issues_for_mermaid``.
        - Text containing "</div>" is treated as HTML and returned unchanged.
        - Anything else has each space replaced by ``&nbsp;`` and each
          newline replaced by ``</br>``.
    """
    if "```" in text:
        # Careful input: Markdown. Only handle the special rendering issue.
        return special_render_issues_for_mermaid(text)
    if "</div>" in text:
        # Careful input: HTML.
        return text
    # Whatever input: not Markdown. The entity is spelled out explicitly so
    # the replacement is a visible, unambiguous non-breaking space.
    return "</br>".join(line.replace(" ", "&nbsp;") for line in text.split("\n"))
@lru_cache(maxsize=128)  # use an LRU cache
def simple_markdown_convertion(text):
    """
    Convert text to HTML wrapped in a ``markdown-body`` div, without math
    handling.

    Args:
        text: The input text. Must be hashable (a ``str``) because results
            are memoised.

    Returns:
        The HTML string. Input that is already wrapped in the
        ``markdown-body`` div is returned unchanged.
    """
    pre = '<div class="markdown-body">'
    suf = "</div>"
    if text.startswith(pre) and text.endswith(suf):
        return text  # already converted, no need to convert again
    text = compat_non_markdown_input(text)  # be tolerant of non-Markdown input
    text = markdown.markdown(
        text,
        extensions=["pymdownx.superfences", "tables", "pymdownx.highlight"],
        extension_configs=code_highlight_configs,
    )
    return pre + text + suf
def format_io(self, y):
    """
    Parse the latest input/output pair into HTML.

    The input part of the last item of ``y`` is converted with
    ``simple_markdown_convertion``; the output part, including Markdown and
    math formulas, is converted with ``markdown_convertion``.

    Args:
        self: Unused by this function.
        y: A list of ``(input, output)`` pairs, or ``None``. Either element
            of a pair may be ``None``.

    Returns:
        ``[]`` when ``y`` is ``None`` or empty; otherwise ``y`` itself, with
        its last item replaced by the converted ``(input, output)`` tuple.
        ``None`` elements stay ``None``.
    """
    if y is None or y == []:
        return []
    i_ask, gpt_reply = y[-1]

    # Apply the string mask only to real values: None means "nothing to
    # show" and must not be handed to the mask helper.
    if i_ask is not None:
        i_ask = apply_gpt_academic_string_mask(i_ask, mode="show_render")
    if gpt_reply is not None:
        gpt_reply = apply_gpt_academic_string_mask(gpt_reply, mode="show_render")

    # When a code segment is only half written, try to add the closing ```.
    if gpt_reply is not None:
        gpt_reply = close_up_code_segment_during_stream(gpt_reply)

    # Convert the question and the reply.
    y[-1] = (
        # input part
        None if i_ask is None else simple_markdown_convertion(i_ask),
        # output part
        None if gpt_reply is None else markdown_convertion(gpt_reply),
    )
    return y