Spaces:

markqiu
/

prinvest_mate

Sleeping

Tuchuanhuhuhu committed on Mar 21, 2023

Commit

90d39c3

1 Parent(s): 75a2593

大幅度改进了输入输出解析。

- 新的代码高亮模块，使用 Pygments 实现。弃用 Markdown 的 fenced_code 和 codehilite 扩展，因为它们的实现方式存在问题。
- 新增规范化 GPT 输出的 Markdown 的功能：即使 GPT 输出不规范的 Markdown 列表，也能正常显示。
- 现在可以正常显示 Shell 脚本了。Shell 脚本中的美元符号不再与 LaTeX 冲突。
- 删除 parse_text 函数，改为直接在 Chatbot 组件中渲染。按理说我可以给 gradio 提一个 PR。

Files changed (5) hide show

chat_func.py +7 -7
custom.css +69 -69
overwrites.py +2 -8
requirements.txt +1 -0
utils.py +70 -26

chat_func.py CHANGED Viewed

@@ -115,9 +115,9 @@ def stream_predict(
     history.append(construct_user(inputs))
     history.append(construct_assistant(""))
     if fake_input:
-        chatbot.append((parse_text(fake_input), ""))
     else:
-        chatbot.append((parse_text(inputs), ""))
     user_token_count = 0
     if len(all_token_counts) == 0:
         system_prompt_token_count = count_token(construct_system(system_prompt))
@@ -192,7 +192,7 @@ def stream_predict(
                     yield get_return_value()
                     break
                 history[-1] = construct_assistant(partial_words)
-                chatbot[-1] = (chatbot[-1][0], parse_text(partial_words+display_append))
                 all_token_counts[-1] += 1
                 yield get_return_value()
@@ -214,9 +214,9 @@ def predict_all(
     history.append(construct_user(inputs))
     history.append(construct_assistant(""))
     if fake_input:
-        chatbot.append((parse_text(fake_input), ""))
     else:
-        chatbot.append((parse_text(inputs), ""))
     all_token_counts.append(count_token(construct_user(inputs)))
     try:
         response = get_response(
@@ -242,7 +242,7 @@ def predict_all(
     response = json.loads(response.text)
     content = response["choices"][0]["message"]["content"]
     history[-1] = construct_assistant(content)
-    chatbot[-1] = (chatbot[-1][0], parse_text(content+display_append))
     total_token_count = response["usage"]["total_tokens"]
     all_token_counts[-1] = total_token_count - sum(all_token_counts)
     status_text = construct_token_message(total_token_count)
@@ -299,7 +299,7 @@ def predict(
     if len(openai_api_key) != 51:
         status_text = standard_error_msg + no_apikey_msg
         logging.info(status_text)
-        chatbot.append((parse_text(inputs), ""))
         if len(history) == 0:
             history.append(construct_user(inputs))
             history.append("")

     history.append(construct_user(inputs))
     history.append(construct_assistant(""))
     if fake_input:
+        chatbot.append((fake_input, ""))
     else:
+        chatbot.append((inputs, ""))
     user_token_count = 0
     if len(all_token_counts) == 0:
         system_prompt_token_count = count_token(construct_system(system_prompt))
                     yield get_return_value()
                     break
                 history[-1] = construct_assistant(partial_words)
+                chatbot[-1] = (chatbot[-1][0], partial_words+display_append)
                 all_token_counts[-1] += 1
                 yield get_return_value()
     history.append(construct_user(inputs))
     history.append(construct_assistant(""))
     if fake_input:
+        chatbot.append((fake_input, ""))
     else:
+        chatbot.append((inputs, ""))
     all_token_counts.append(count_token(construct_user(inputs)))
     try:
         response = get_response(
     response = json.loads(response.text)
     content = response["choices"][0]["message"]["content"]
     history[-1] = construct_assistant(content)
+    chatbot[-1] = (chatbot[-1][0], content+display_append)
     total_token_count = response["usage"]["total_tokens"]
     all_token_counts[-1] = total_token_count - sum(all_token_counts)
     status_text = construct_token_message(total_token_count)
     if len(openai_api_key) != 51:
         status_text = standard_error_msg + no_apikey_msg
         logging.info(status_text)
+        chatbot.append((inputs, ""))
         if len(history) == 0:
             history.append(construct_user(inputs))
             history.append("")

custom.css CHANGED Viewed

@@ -130,72 +130,72 @@ pre code {
     box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
 }
 /* 代码高亮样式 */
-.codehilite .hll { background-color: #49483e }
-.codehilite .c { color: #75715e } /* Comment */
-.codehilite .err { color: #960050; background-color: #1e0010 } /* Error */
-.codehilite .k { color: #66d9ef } /* Keyword */
-.codehilite .l { color: #ae81ff } /* Literal */
-.codehilite .n { color: #f8f8f2 } /* Name */
-.codehilite .o { color: #f92672 } /* Operator */
-.codehilite .p { color: #f8f8f2 } /* Punctuation */
-.codehilite .ch { color: #75715e } /* Comment.Hashbang */
-.codehilite .cm { color: #75715e } /* Comment.Multiline */
-.codehilite .cp { color: #75715e } /* Comment.Preproc */
-.codehilite .cpf { color: #75715e } /* Comment.PreprocFile */
-.codehilite .c1 { color: #75715e } /* Comment.Single */
-.codehilite .cs { color: #75715e } /* Comment.Special */
-.codehilite .gd { color: #f92672 } /* Generic.Deleted */
-.codehilite .ge { font-style: italic } /* Generic.Emph */
-.codehilite .gi { color: #a6e22e } /* Generic.Inserted */
-.codehilite .gs { font-weight: bold } /* Generic.Strong */
-.codehilite .gu { color: #75715e } /* Generic.Subheading */
-.codehilite .kc { color: #66d9ef } /* Keyword.Constant */
-.codehilite .kd { color: #66d9ef } /* Keyword.Declaration */
-.codehilite .kn { color: #f92672 } /* Keyword.Namespace */
-.codehilite .kp { color: #66d9ef } /* Keyword.Pseudo */
-.codehilite .kr { color: #66d9ef } /* Keyword.Reserved */
-.codehilite .kt { color: #66d9ef } /* Keyword.Type */
-.codehilite .ld { color: #e6db74 } /* Literal.Date */
-.codehilite .m { color: #ae81ff } /* Literal.Number */
-.codehilite .s { color: #e6db74 } /* Literal.String */
-.codehilite .na { color: #a6e22e } /* Name.Attribute */
-.codehilite .nb { color: #f8f8f2 } /* Name.Builtin */
-.codehilite .nc { color: #a6e22e } /* Name.Class */
-.codehilite .no { color: #66d9ef } /* Name.Constant */
-.codehilite .nd { color: #a6e22e } /* Name.Decorator */
-.codehilite .ni { color: #f8f8f2 } /* Name.Entity */
-.codehilite .ne { color: #a6e22e } /* Name.Exception */
-.codehilite .nf { color: #a6e22e } /* Name.Function */
-.codehilite .nl { color: #f8f8f2 } /* Name.Label */
-.codehilite .nn { color: #f8f8f2 } /* Name.Namespace */
-.codehilite .nx { color: #a6e22e } /* Name.Other */
-.codehilite .py { color: #f8f8f2 } /* Name.Property */
-.codehilite .nt { color: #f92672 } /* Name.Tag */
-.codehilite .nv { color: #f8f8f2 } /* Name.Variable */
-.codehilite .ow { color: #f92672 } /* Operator.Word */
-.codehilite .w { color: #f8f8f2 } /* Text.Whitespace */
-.codehilite .mb { color: #ae81ff } /* Literal.Number.Bin */
-.codehilite .mf { color: #ae81ff } /* Literal.Number.Float */
-.codehilite .mh { color: #ae81ff } /* Literal.Number.Hex */
-.codehilite .mi { color: #ae81ff } /* Literal.Number.Integer */
-.codehilite .mo { color: #ae81ff } /* Literal.Number.Oct */
-.codehilite .sa { color: #e6db74 } /* Literal.String.Affix */
-.codehilite .sb { color: #e6db74 } /* Literal.String.Backtick */
-.codehilite .sc { color: #e6db74 } /* Literal.String.Char */
-.codehilite .dl { color: #e6db74 } /* Literal.String.Delimiter */
-.codehilite .sd { color: #e6db74 } /* Literal.String.Doc */
-.codehilite .s2 { color: #e6db74 } /* Literal.String.Double */
-.codehilite .se { color: #ae81ff } /* Literal.String.Escape */
-.codehilite .sh { color: #e6db74 } /* Literal.String.Heredoc */
-.codehilite .si { color: #e6db74 } /* Literal.String.Interpol */
-.codehilite .sx { color: #e6db74 } /* Literal.String.Other */
-.codehilite .sr { color: #e6db74 } /* Literal.String.Regex */
-.codehilite .s1 { color: #e6db74 } /* Literal.String.Single */
-.codehilite .ss { color: #e6db74 } /* Literal.String.Symbol */
-.codehilite .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */
-.codehilite .fm { color: #a6e22e } /* Name.Function.Magic */
-.codehilite .vc { color: #f8f8f2 } /* Name.Variable.Class */
-.codehilite .vg { color: #f8f8f2 } /* Name.Variable.Global */
-.codehilite .vi { color: #f8f8f2 } /* Name.Variable.Instance */
-.codehilite .vm { color: #f8f8f2 } /* Name.Variable.Magic */
-.codehilite .il { color: #ae81ff } /* Literal.Number.Integer.Long */

     box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
 }
 /* 代码高亮样式 */
+.highlight .hll { background-color: #49483e }
+.highlight .c { color: #75715e } /* Comment */
+.highlight .err { color: #960050; background-color: #1e0010 } /* Error */
+.highlight .k { color: #66d9ef } /* Keyword */
+.highlight .l { color: #ae81ff } /* Literal */
+.highlight .n { color: #f8f8f2 } /* Name */
+.highlight .o { color: #f92672 } /* Operator */
+.highlight .p { color: #f8f8f2 } /* Punctuation */
+.highlight .ch { color: #75715e } /* Comment.Hashbang */
+.highlight .cm { color: #75715e } /* Comment.Multiline */
+.highlight .cp { color: #75715e } /* Comment.Preproc */
+.highlight .cpf { color: #75715e } /* Comment.PreprocFile */
+.highlight .c1 { color: #75715e } /* Comment.Single */
+.highlight .cs { color: #75715e } /* Comment.Special */
+.highlight .gd { color: #f92672 } /* Generic.Deleted */
+.highlight .ge { font-style: italic } /* Generic.Emph */
+.highlight .gi { color: #a6e22e } /* Generic.Inserted */
+.highlight .gs { font-weight: bold } /* Generic.Strong */
+.highlight .gu { color: #75715e } /* Generic.Subheading */
+.highlight .kc { color: #66d9ef } /* Keyword.Constant */
+.highlight .kd { color: #66d9ef } /* Keyword.Declaration */
+.highlight .kn { color: #f92672 } /* Keyword.Namespace */
+.highlight .kp { color: #66d9ef } /* Keyword.Pseudo */
+.highlight .kr { color: #66d9ef } /* Keyword.Reserved */
+.highlight .kt { color: #66d9ef } /* Keyword.Type */
+.highlight .ld { color: #e6db74 } /* Literal.Date */
+.highlight .m { color: #ae81ff } /* Literal.Number */
+.highlight .s { color: #e6db74 } /* Literal.String */
+.highlight .na { color: #a6e22e } /* Name.Attribute */
+.highlight .nb { color: #f8f8f2 } /* Name.Builtin */
+.highlight .nc { color: #a6e22e } /* Name.Class */
+.highlight .no { color: #66d9ef } /* Name.Constant */
+.highlight .nd { color: #a6e22e } /* Name.Decorator */
+.highlight .ni { color: #f8f8f2 } /* Name.Entity */
+.highlight .ne { color: #a6e22e } /* Name.Exception */
+.highlight .nf { color: #a6e22e } /* Name.Function */
+.highlight .nl { color: #f8f8f2 } /* Name.Label */
+.highlight .nn { color: #f8f8f2 } /* Name.Namespace */
+.highlight .nx { color: #a6e22e } /* Name.Other */
+.highlight .py { color: #f8f8f2 } /* Name.Property */
+.highlight .nt { color: #f92672 } /* Name.Tag */
+.highlight .nv { color: #f8f8f2 } /* Name.Variable */
+.highlight .ow { color: #f92672 } /* Operator.Word */
+.highlight .w { color: #f8f8f2 } /* Text.Whitespace */
+.highlight .mb { color: #ae81ff } /* Literal.Number.Bin */
+.highlight .mf { color: #ae81ff } /* Literal.Number.Float */
+.highlight .mh { color: #ae81ff } /* Literal.Number.Hex */
+.highlight .mi { color: #ae81ff } /* Literal.Number.Integer */
+.highlight .mo { color: #ae81ff } /* Literal.Number.Oct */
+.highlight .sa { color: #e6db74 } /* Literal.String.Affix */
+.highlight .sb { color: #e6db74 } /* Literal.String.Backtick */
+.highlight .sc { color: #e6db74 } /* Literal.String.Char */
+.highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */
+.highlight .sd { color: #e6db74 } /* Literal.String.Doc */
+.highlight .s2 { color: #e6db74 } /* Literal.String.Double */
+.highlight .se { color: #ae81ff } /* Literal.String.Escape */
+.highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */
+.highlight .si { color: #e6db74 } /* Literal.String.Interpol */
+.highlight .sx { color: #e6db74 } /* Literal.String.Other */
+.highlight .sr { color: #e6db74 } /* Literal.String.Regex */
+.highlight .s1 { color: #e6db74 } /* Literal.String.Single */
+.highlight .ss { color: #e6db74 } /* Literal.String.Symbol */
+.highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */
+.highlight .fm { color: #a6e22e } /* Name.Function.Magic */
+.highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */
+.highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */
+.highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */
+.highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */
+.highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */

overwrites.py CHANGED Viewed

@@ -28,13 +28,7 @@ def postprocess(
     Returns:
         List of tuples representing the message and response. Each message and response will be a string of HTML.
     """
-    if y is None:
         return []
-    for i, (message, response) in enumerate(y):
-        y[i] = (
-            # None if message is None else markdown.markdown(message),
-            # None if response is None else markdown.markdown(response),
-            None if message is None else message,
-            None if response is None else mdtex2html.convert(response, extensions=['fenced_code','codehilite','tables']),
-        )
     return y

     Returns:
         List of tuples representing the message and response. Each message and response will be a string of HTML.
     """
+    if y is None or y == []:
         return []
+    y[-1] = (y[-1][0].replace("\n", "<br>"), convert_mdtext(y[-1][1]))
     return y

requirements.txt CHANGED Viewed

@@ -9,3 +9,4 @@ duckduckgo_search
 Pygments
 llama_index
 langchain

 Pygments
 llama_index
 langchain
+markdown

utils.py CHANGED Viewed

@@ -13,6 +13,11 @@ import re
 import gradio as gr
 from pypinyin import lazy_pinyin
 import tiktoken
 from presets import *
@@ -32,34 +37,73 @@ def count_token(message):
     length = len(encoding.encode(input_str))
     return length
-def parse_text(text):
-    in_code_block = False
-    in_list = False
-    new_lines = []
-    for line in text.split("\n"):
-        if line.strip().startswith("```"):
-            in_code_block = not in_code_block
-        else:
-            if re.match(r'(\*|-|\d+\.)\s', line):
-                if not in_list:
-                    in_list = True
-            elif in_list and line.strip() != "":
-                in_list = False
-                new_lines.append("")
-        if in_code_block:
-            if line.strip() != "":
-                new_lines.append(line)
-        elif in_list:
-            if line.strip() != "":
-                new_lines.append(line)
         else:
-            new_lines.append(line)
-    if in_code_block:
-        new_lines.append("```")
-    text = "\n".join(new_lines)
-    return text
 def construct_text(role, text):

 import gradio as gr
 from pypinyin import lazy_pinyin
 import tiktoken
+import mdtex2html
+from markdown import markdown
+from pygments import highlight
+from pygments.lexers import get_lexer_by_name
+from pygments.formatters import HtmlFormatter
 from presets import *
     length = len(encoding.encode(input_str))
     return length
+def markdown_to_html_with_syntax_highlight(md_str):
+    def replacer(match):
+        lang = match.group(1) or 'text'
+        code = match.group(2)
+        try:
+            lexer = get_lexer_by_name(lang, stripall=True)
+        except ValueError:
+            lexer = get_lexer_by_name("text", stripall=True)
+        formatter = HtmlFormatter()
+        highlighted_code = highlight(code, lexer, formatter)
+        return f"<pre><code class=\"{lang}\">{highlighted_code}</code></pre>"
+    code_block_pattern = r'```(\w+)?\n([\s\S]+?)\n```'
+    md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)
+    html_str = markdown(md_str)
+    return html_str
+def normalize_markdown(md_text: str) -> str:
+    lines = md_text.split('\n')
+    normalized_lines = []
+    inside_list = False
+    for i, line in enumerate(lines):
+        if re.match(r'^(\d+\.|-|\*|\+)\s', line.strip()):
+            if not inside_list and i > 0 and lines[i - 1].strip() != '':
+                normalized_lines.append('')
+            inside_list = True
+            normalized_lines.append(line)
+        elif inside_list and line.strip() == '':
+            if i < len(lines) - 1 and not re.match(r'^(\d+\.|-|\*|\+)\s', lines[i + 1].strip()):
+                normalized_lines.append(line)
+            continue
         else:
+            inside_list = False
+            normalized_lines.append(line)
+    return '\n'.join(normalized_lines)
+def convert_mdtext(md_text):
+    code_block_pattern = re.compile(r'```(.*?)(?:```|$)', re.DOTALL)
+    code_blocks = code_block_pattern.findall(md_text)
+    non_code_parts = code_block_pattern.split(md_text)[::2]
+    result = []
+    for non_code, code in zip(non_code_parts, code_blocks + ['']):
+        if non_code.strip():
+            non_code = normalize_markdown(non_code)
+            result.append(mdtex2html.convert(non_code, extensions=['tables']))
+        if code.strip():
+            code = f"```{code}```"
+            code = markdown_to_html_with_syntax_highlight(code)
+            result.append(code)
+    result = "".join(result)
+    return result
+def detect_language(code):
+    if code.startswith("\n"):
+        first_line = ""
+    else:
+        first_line = code.strip().split('\n', 1)[0]
+    language = first_line.lower() if first_line else ''
+    code_without_language = code[len(first_line):].lstrip() if first_line else code
+    return language, code_without_language
 def construct_text(role, text):