Spaces:
Sleeping
Sleeping
Update 2-column pdf; Update new config type; Update new proxy method (#479)
Browse files
* Update 2-column pdf; Update new config type; Update new proxy method
* 更新requirements
---------
Co-authored-by: Tuchuanhuhuhu <[email protected]>
- .gitignore +1 -1
- ChuanhuChatbot.py +1 -47
- config_example.json +11 -0
- modules/chat_func.py +9 -9
- modules/config.py +113 -0
- modules/llama_func.py +10 -5
- modules/openai_func.py +7 -8
- modules/pdf_func.py +180 -0
- modules/utils.py +3 -25
- requirements.txt +1 -0
.gitignore
CHANGED
|
@@ -134,6 +134,6 @@ dmypy.json
|
|
| 134 |
**/.DS_Store
|
| 135 |
|
| 136 |
api_key.txt
|
| 137 |
-
|
| 138 |
auth.json
|
| 139 |
.idea
|
|
|
|
| 134 |
**/.DS_Store
|
| 135 |
|
| 136 |
api_key.txt
|
| 137 |
+
config.json
|
| 138 |
auth.json
|
| 139 |
.idea
|
ChuanhuChatbot.py
CHANGED
|
@@ -5,59 +5,13 @@ import sys
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
|
|
|
|
| 8 |
from modules.utils import *
|
| 9 |
from modules.presets import *
|
| 10 |
from modules.overwrites import *
|
| 11 |
from modules.chat_func import *
|
| 12 |
from modules.openai_func import get_usage
|
| 13 |
|
| 14 |
-
logging.basicConfig(
|
| 15 |
-
level=logging.DEBUG,
|
| 16 |
-
format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
my_api_key = "" # 在这里输入你的 API 密钥
|
| 20 |
-
|
| 21 |
-
# if we are running in Docker
|
| 22 |
-
if os.environ.get("dockerrun") == "yes":
|
| 23 |
-
dockerflag = True
|
| 24 |
-
else:
|
| 25 |
-
dockerflag = False
|
| 26 |
-
|
| 27 |
-
authflag = False
|
| 28 |
-
auth_list = []
|
| 29 |
-
|
| 30 |
-
if not my_api_key:
|
| 31 |
-
my_api_key = os.environ.get("my_api_key")
|
| 32 |
-
if dockerflag:
|
| 33 |
-
if my_api_key == "empty":
|
| 34 |
-
logging.error("Please give a api key!")
|
| 35 |
-
sys.exit(1)
|
| 36 |
-
# auth
|
| 37 |
-
username = os.environ.get("USERNAME")
|
| 38 |
-
password = os.environ.get("PASSWORD")
|
| 39 |
-
if not (isinstance(username, type(None)) or isinstance(password, type(None))):
|
| 40 |
-
auth_list.append((os.environ.get("USERNAME"), os.environ.get("PASSWORD")))
|
| 41 |
-
authflag = True
|
| 42 |
-
else:
|
| 43 |
-
if (
|
| 44 |
-
not my_api_key
|
| 45 |
-
and os.path.exists("api_key.txt")
|
| 46 |
-
and os.path.getsize("api_key.txt")
|
| 47 |
-
):
|
| 48 |
-
with open("api_key.txt", "r") as f:
|
| 49 |
-
my_api_key = f.read().strip()
|
| 50 |
-
if os.path.exists("auth.json"):
|
| 51 |
-
authflag = True
|
| 52 |
-
with open("auth.json", "r", encoding='utf-8') as f:
|
| 53 |
-
auth = json.load(f)
|
| 54 |
-
for _ in auth:
|
| 55 |
-
if auth[_]["username"] and auth[_]["password"]:
|
| 56 |
-
auth_list.append((auth[_]["username"], auth[_]["password"]))
|
| 57 |
-
else:
|
| 58 |
-
logging.error("请检查auth.json文件中的用户名和密码!")
|
| 59 |
-
sys.exit(1)
|
| 60 |
-
|
| 61 |
gr.Chatbot.postprocess = postprocess
|
| 62 |
PromptHelper.compact_text_chunks = compact_text_chunks
|
| 63 |
|
|
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
|
| 8 |
+
from modules.config import *
|
| 9 |
from modules.utils import *
|
| 10 |
from modules.presets import *
|
| 11 |
from modules.overwrites import *
|
| 12 |
from modules.chat_func import *
|
| 13 |
from modules.openai_func import get_usage
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
gr.Chatbot.postprocess = postprocess
|
| 16 |
PromptHelper.compact_text_chunks = compact_text_chunks
|
| 17 |
|
config_example.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"openai_api_key": "sk-xxxxxxxxxxxxxxxxxxxxxxxxx",
|
| 3 |
+
"https_proxy": "http://127.0.0.1:1079",
|
| 4 |
+
"http_proxy": "http://127.0.0.1:1079",
|
| 5 |
+
"advanced_pdf_kwargs": {
|
| 6 |
+
"two_column": true
|
| 7 |
+
},
|
| 8 |
+
"users": [
|
| 9 |
+
["root", "root"]
|
| 10 |
+
]
|
| 11 |
+
}
|
modules/chat_func.py
CHANGED
|
@@ -21,6 +21,7 @@ from modules.presets import *
|
|
| 21 |
from modules.llama_func import *
|
| 22 |
from modules.utils import *
|
| 23 |
import modules.shared as shared
|
|
|
|
| 24 |
|
| 25 |
# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s")
|
| 26 |
|
|
@@ -61,20 +62,19 @@ def get_response(
|
|
| 61 |
else:
|
| 62 |
timeout = timeout_all
|
| 63 |
|
| 64 |
-
proxies = get_proxies()
|
| 65 |
|
| 66 |
# 如果有自定义的api-url,使用自定义url发送请求,否则使用默认设置发送请求
|
| 67 |
if shared.state.api_url != API_URL:
|
| 68 |
logging.info(f"使用自定义API URL: {shared.state.api_url}")
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
|
| 79 |
return response
|
| 80 |
|
|
|
|
| 21 |
from modules.llama_func import *
|
| 22 |
from modules.utils import *
|
| 23 |
import modules.shared as shared
|
| 24 |
+
from modules.config import retrieve_proxy
|
| 25 |
|
| 26 |
# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s")
|
| 27 |
|
|
|
|
| 62 |
else:
|
| 63 |
timeout = timeout_all
|
| 64 |
|
|
|
|
| 65 |
|
| 66 |
# 如果有自定义的api-url,使用自定义url发送请求,否则使用默认设置发送请求
|
| 67 |
if shared.state.api_url != API_URL:
|
| 68 |
logging.info(f"使用自定义API URL: {shared.state.api_url}")
|
| 69 |
|
| 70 |
+
with retrieve_proxy():
|
| 71 |
+
response = requests.post(
|
| 72 |
+
shared.state.api_url,
|
| 73 |
+
headers=headers,
|
| 74 |
+
json=payload,
|
| 75 |
+
stream=True,
|
| 76 |
+
timeout=timeout,
|
| 77 |
+
)
|
| 78 |
|
| 79 |
return response
|
| 80 |
|
modules/config.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from contextlib import contextmanager
|
| 2 |
+
import os
|
| 3 |
+
import logging
|
| 4 |
+
import sys
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
"my_api_key",
|
| 9 |
+
"authflag",
|
| 10 |
+
"auth_list",
|
| 11 |
+
"dockerflag",
|
| 12 |
+
"retrieve_proxy",
|
| 13 |
+
"log_level",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
# A single config.json collects settings that used to be spread over several
# files, and gives later user-defined features one place to read options from.
# Values read from it act as defaults: the environment variables consulted
# below take precedence over them.
if os.path.exists("config.json"):
    with open("config.json", "r", encoding='utf-8') as f:
        config = json.load(f)
else:
    config = {}

# Docker detection: the "dockerrun" environment variable forces the flag on,
# whatever config.json says.
dockerflag = config.get("dockerflag", False)
if os.environ.get("dockerrun") == "yes":
    dockerflag = True

# API key and the list of (username, password) pairs allowed to log in.
my_api_key = config.get("openai_api_key", "")
authflag = "users" in config
# JSON arrays load as lists; convert them to tuples so every entry has the
# same type as the pairs appended from the environment / auth.json below.
auth_list = [tuple(user) for user in config.get("users", [])]
my_api_key = os.environ.get("my_api_key", my_api_key)
if dockerflag:
    # NOTE(review): "empty" looks like a placeholder supplied by the Docker
    # setup when no key was given -- confirm against the Dockerfile.
    if my_api_key == "empty":
        logging.error("Please give a api key!")
        sys.exit(1)
    # Credentials come from the environment when running in Docker.
    username = os.environ.get("USERNAME")
    password = os.environ.get("PASSWORD")
    if username is not None and password is not None:
        auth_list.append((username, password))
        authflag = True
else:
    # Legacy api_key.txt is only consulted when no key was found so far and
    # the file exists and is non-empty.
    api_key_file_usable = (
        not my_api_key
        and os.path.exists("api_key.txt")
        and os.path.getsize("api_key.txt")
    )
    if api_key_file_usable:
        with open("api_key.txt", "r") as f:
            my_api_key = f.read().strip()
    # Legacy auth.json: a mapping whose values carry "username"/"password".
    if os.path.exists("auth.json"):
        authflag = True
        with open("auth.json", "r", encoding='utf-8') as f:
            auth = json.load(f)
        for entry in auth.values():
            if entry["username"] and entry["password"]:
                auth_list.append((entry["username"], entry["password"]))
            else:
                # An incomplete entry aborts start-up instead of being skipped.
                logging.error("请检查auth.json文件中的用户名和密码!")
                sys.exit(1)
|
| 62 |
+
|
| 63 |
+
@contextmanager
def retrieve_openai_api(api_key = None):
    """Temporarily expose an API key through the OPENAI_API_KEY variable.

    Args:
        api_key: key to export for the duration of the ``with`` block. When
            ``None``, the module-level ``my_api_key`` is used instead.

    Yields:
        The key that was exported.

    On exit OPENAI_API_KEY is set back to the value it had on entry (the
    empty string if it was unset), even when the ``with`` body raises.
    """
    active_key = my_api_key if api_key is None else api_key
    old_api_key = os.environ.get("OPENAI_API_KEY", "")
    os.environ["OPENAI_API_KEY"] = active_key
    try:
        yield active_key
    finally:
        # Without the finally clause an exception inside the with-body would
        # leave the temporary key exported for the rest of the process.
        os.environ["OPENAI_API_KEY"] = old_api_key
|
| 73 |
+
|
| 74 |
+
# Logging setup. The logging module only recognises upper-case level names,
# so a string such as "debug" from config.json is normalised before use
# instead of aborting the import with "Unknown level".
log_level = config.get("log_level", "INFO")
if isinstance(log_level, str):
    log_level = log_level.upper()
logging.basicConfig(
    level=log_level,
    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
)

# Proxy setup: config.json supplies the defaults, HTTP_PROXY / HTTPS_PROXY
# from the environment override them.
http_proxy = os.environ.get("HTTP_PROXY", config.get("http_proxy", ""))
https_proxy = os.environ.get("HTTPS_PROXY", config.get("https_proxy", ""))

# Blank the process-wide variables so that no proxy is applied globally;
# the remembered values above are exported only where they are needed, which
# avoids errors caused by an always-on proxy.
os.environ["HTTP_PROXY"] = ""
os.environ["HTTPS_PROXY"] = ""
|
| 90 |
+
|
| 91 |
+
@contextmanager
def _proxy_scope(export_to_environment):
    """Yield the stored (http_proxy, https_proxy) pair, optionally exporting it.

    When ``export_to_environment`` is true, HTTP_PROXY / HTTPS_PROXY are set
    to the stored values for the duration of the ``with`` block and put back
    to their previous values afterwards, even if the body raises.
    """
    if not export_to_environment:
        yield http_proxy, https_proxy
        return

    old_var = os.environ.get("HTTP_PROXY", ""), os.environ.get("HTTPS_PROXY", "")
    os.environ["HTTP_PROXY"] = http_proxy
    os.environ["HTTPS_PROXY"] = https_proxy
    try:
        yield http_proxy, https_proxy
    finally:
        # Restore the previous environment so the proxy does not leak into
        # unrelated requests after a failure inside the with-body.
        os.environ["HTTP_PROXY"], os.environ["HTTPS_PROXY"] = old_var


def retrieve_proxy(proxy=None):
    """Return a context manager yielding the ``(http_proxy, https_proxy)`` pair.

    1. If ``proxy`` is None, the environment variables are set to the stored
       proxy while the context is active and restored on exit.
    2. If ``proxy`` is not None, the stored proxy configuration is replaced by
       it and the environment variables are left untouched.

    The stored configuration is updated as soon as the function is called, so
    a plain ``retrieve_proxy(value)`` call takes effect even when the returned
    context manager is never entered.
    """
    global http_proxy, https_proxy
    if proxy is not None:
        http_proxy = proxy
        https_proxy = proxy
    return _proxy_scope(export_to_environment=proxy is None)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Options for the advanced PDF parser. config_example.json names this section
# "advanced_pdf_kwargs", so that key is accepted as well; "advance_pdf" is
# still read first for configurations that already use it.
advance_pdf = config.get("advance_pdf", config.get("advanced_pdf_kwargs", {}))
|
modules/llama_func.py
CHANGED
|
@@ -46,11 +46,16 @@ def get_documents(file_src):
|
|
| 46 |
logging.info(f"loading file: {file.name}")
|
| 47 |
if os.path.splitext(file.name)[1] == ".pdf":
|
| 48 |
logging.debug("Loading PDF...")
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
text_raw = pdftext
|
| 55 |
elif os.path.splitext(file.name)[1] == ".docx":
|
| 56 |
logging.debug("Loading DOCX...")
|
|
|
|
| 46 |
logging.info(f"loading file: {file.name}")
|
| 47 |
if os.path.splitext(file.name)[1] == ".pdf":
|
| 48 |
logging.debug("Loading PDF...")
|
| 49 |
+
try:
|
| 50 |
+
from modules.pdf_func import parse_pdf
|
| 51 |
+
from modules.config import advance_pdf
|
| 52 |
+
text = parse_pdf(file.name, advance_pdf.get("two_column", False)).text
|
| 53 |
+
except:
|
| 54 |
+
pdftext = ""
|
| 55 |
+
with open(file.name, 'rb') as pdfFileObj:
|
| 56 |
+
pdfReader = PyPDF2.PdfReader(pdfFileObj)
|
| 57 |
+
for page in tqdm(pdfReader.pages):
|
| 58 |
+
pdftext += page.extract_text()
|
| 59 |
text_raw = pdftext
|
| 60 |
elif os.path.splitext(file.name)[1] == ".docx":
|
| 61 |
logging.debug("Loading DOCX...")
|
modules/openai_func.py
CHANGED
|
@@ -11,7 +11,7 @@ from modules.presets import (
|
|
| 11 |
)
|
| 12 |
|
| 13 |
from modules import shared
|
| 14 |
-
from modules.
|
| 15 |
import os, datetime
|
| 16 |
|
| 17 |
def get_billing_data(openai_api_key, billing_url):
|
|
@@ -21,13 +21,12 @@ def get_billing_data(openai_api_key, billing_url):
|
|
| 21 |
}
|
| 22 |
|
| 23 |
timeout = timeout_all
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
)
|
| 31 |
|
| 32 |
if response.status_code == 200:
|
| 33 |
data = response.json()
|
|
|
|
| 11 |
)
|
| 12 |
|
| 13 |
from modules import shared
|
| 14 |
+
from modules.config import retrieve_proxy
|
| 15 |
import os, datetime
|
| 16 |
|
| 17 |
def get_billing_data(openai_api_key, billing_url):
|
|
|
|
| 21 |
}
|
| 22 |
|
| 23 |
timeout = timeout_all
|
| 24 |
+
with retrieve_proxy():
|
| 25 |
+
response = requests.get(
|
| 26 |
+
billing_url,
|
| 27 |
+
headers=headers,
|
| 28 |
+
timeout=timeout,
|
| 29 |
+
)
|
|
|
|
| 30 |
|
| 31 |
if response.status_code == 200:
|
| 32 |
data = response.json()
|
modules/pdf_func.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from types import SimpleNamespace
|
| 2 |
+
import pdfplumber
|
| 3 |
+
import logging
|
| 4 |
+
from llama_index import Document
|
| 5 |
+
|
| 6 |
+
def prepare_table_config(crop_page):
    """Build explicit-line table-finder settings for a page.

    The candidate lines are taken from ``crop_page.root_page`` (the original,
    uncropped page), not from the crop itself.

    From https://github.com/jsvine/pdfplumber/issues/242
    (curve-to-edge conversion: https://github.com/jsvine/pdfplumber/issues/127)

    Args:
        crop_page: a page object exposing ``root_page``.

    Returns:
        dict of table settings using the "explicit" strategy in both
        directions, with the same edge list for vertical and horizontal lines.
    """
    page = crop_page.root_page  # root/parent
    edges = [
        edge
        for shape in page.curves + page.edges
        for edge in pdfplumber.utils.rect_to_edges(shape)
    ]
    return {
        "vertical_strategy": "explicit",
        "horizontal_strategy": "explicit",
        "explicit_vertical_lines": edges,
        "explicit_horizontal_lines": edges,
        "intersection_y_tolerance": 10,
    }
|
| 27 |
+
|
| 28 |
+
def get_text_outside_table(crop_page):
    """Return ``crop_page`` without the objects that fall inside a table.

    Tables are located on ``crop_page.root_page`` with the settings from
    ``prepare_table_config``. When no candidate lines exist, the page is
    returned unchanged.

    Args:
        crop_page: page (or crop) to filter.

    Returns:
        ``crop_page`` itself, or the result of ``crop_page.filter(...)``.
    """
    ts = prepare_table_config(crop_page)
    if not ts["explicit_vertical_lines"] or not ts["explicit_horizontal_lines"]:
        return crop_page

    # Bounding boxes of the tables found on the page.
    bboxes = [table.bbox for table in crop_page.root_page.find_tables(table_settings=ts)]

    def not_within_bboxes(obj):
        """Return True when the object's midpoint lies in none of the table boxes.

        See https://github.com/jsvine/pdfplumber/blob/stable/pdfplumber/table.py#L404
        """
        # The midpoint depends only on the object, so compute it once rather
        # than once per table box.
        v_mid = (obj["top"] + obj["bottom"]) / 2
        h_mid = (obj["x0"] + obj["x1"]) / 2
        return not any(
            (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
            for x0, top, x1, bottom in bboxes
        )

    return crop_page.filter(not_within_bboxes)
|
| 46 |
+
# Please write formulas in LaTeX: wrap inline formulas in $ and display formulas in $$
|
| 47 |
+
|
| 48 |
+
def extract_words(page):
    """Extract the words of ``page`` with the settings shared by this module.

    Blank characters are kept inside words, and each word additionally
    carries the "fontname", "size" and "object_type" attributes.

    Example keys of a returned word (as recorded by the original author):
    'text', 'x0', 'x1', 'top', 'doctop', 'bottom', 'upright', 'direction',
    'fontname', 'size'.
    """
    return page.extract_words(
        keep_blank_chars=True,
        y_tolerance=0,
        x_tolerance=1,
        extra_attrs=["fontname", "size", "object_type"],
    )
|
| 50 |
+
|
| 51 |
+
def get_title_with_cropped_page(first_page):
    """Split the first page into title, author information and remaining body.

    Words with a font size of at least 14 are collected as the title. The
    word "Abstract" marks where the body starts. Whatever lies between the
    bottom of the last title word and the top of "Abstract" is returned as
    author information.

    Args:
        first_page: the first page of the document.

    Returns:
        Tuple ``(title, user_info, cropped_page)``: a list of title words, a
        list of author-information strings (empty when no title word was
        found or the title does not end above the body), and ``first_page``
        restricted to the area from the body start downwards.
    """
    title = []
    x0, top, x1, bottom = first_page.bbox  # page bounding box
    # Stays None when no word is large enough to count as title text.
    title_bottom = None

    for word in extract_words(first_page):
        word = SimpleNamespace(**word)

        if word.size >= 14:
            title.append(word.text)
            title_bottom = word.bottom
        elif word.text == "Abstract":  # body starts at the abstract
            top = word.top

    # Only look for author information when there is a non-empty area between
    # title and body; otherwise the crop box would be undefined or inverted.
    if title_bottom is not None and title_bottom < top:
        author_area = first_page.within_bbox((x0, title_bottom, x1, top))
        user_info = [i["text"] for i in extract_words(author_area)]
    else:
        user_info = []

    # Drop the upper part. within_bbox keeps fully included objects only;
    # crop would also keep partially included ones.
    return title, user_info, first_page.within_bbox((x0, top, x1, bottom))
|
| 67 |
+
|
| 68 |
+
def get_column_cropped_pages(pages, two_column=True):
    """Return the pages in reading order, split into columns when requested.

    Args:
        pages: iterable of page objects exposing ``width``, ``height`` and
            ``within_bbox``.
        two_column: when true, each page is replaced by its left half
            followed by its right half; otherwise pages are kept as they are.

    Returns:
        A new list of pages / page halves.
    """
    # The layout choice does not depend on the page, so decide it once.
    if not two_column:
        return list(pages)

    new_pages = []
    for page in pages:
        half_width = page.width / 2
        new_pages.append(page.within_bbox((0, 0, half_width, page.height), relative=True))
        new_pages.append(page.within_bbox((half_width, 0, page.width, page.height), relative=True))
    return new_pages
|
| 80 |
+
|
| 81 |
+
def parse_pdf(filename, two_column = True):
    """Parse a PDF into a single ``Document`` organised by chapter.

    The first page supplies the title and author information; the remaining
    text is grouped into chapters by ``_collect_chapters``.

    Args:
        filename: path of the PDF file.
        two_column: treat every page as two side-by-side columns.

    Returns:
        ``Document`` whose text starts with the title / information line,
        followed by one line per chapter, and whose ``extra_info`` holds the
        title.
    """
    root_logger = logging.getLogger()
    level = root_logger.level
    # While parsing, a DEBUG root logger is raised to INFO.
    # NOTE(review): presumably to silence verbose output of the PDF
    # libraries -- confirm.
    if level == logging.DEBUG:
        root_logger.setLevel("INFO")

    try:
        with pdfplumber.open(filename) as pdf:
            title, user_info, first_page = get_title_with_cropped_page(pdf.pages[0])
            new_pages = get_column_cropped_pages([first_page] + pdf.pages[1:], two_column)
            chapters = _collect_chapters(new_pages)

            for chapter in chapters:
                logging.info(f"section: {chapter.name} pages:{chapter.page_start, chapter.page_stop} word-count:{len(chapter.text)}")
                logging.debug(" ".join(chapter.text))

        title = " ".join(title)
        user_info = " ".join(user_info)
        parts = [f"Article Title: {title}, Information:{user_info}\n"]
        for idx, chapter in enumerate(chapters):
            chapter.name = " ".join(chapter.name)
            parts.append(f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n")
        text = "".join(parts)

        return Document(text=text, extra_info={"title": title})
    finally:
        # Always put the caller's log level back, also when parsing fails.
        root_logger.setLevel(level)


def _new_chapter(page_start, name_top, name_bottom):
    """Return an empty chapter record whose heading sits at the given position."""
    return SimpleNamespace(
        name=[],
        name_top=name_top,
        name_bottom=name_bottom,
        record_chapter_name=True,
        page_start=page_start,
        page_stop=None,
        text=[],
    )


def _collect_chapters(pages):
    """Group the words of ``pages`` into chapters, using font size to spot headings.

    A word with a font size of at least 11 is heading text; anything smaller
    is body text of the current chapter. Text inside tables is ignored.
    """
    chapters = []
    cur_chapter = None
    last_page_number = None

    for page in pages:
        page = get_text_outside_table(page)
        last_page_number = page.page_number

        for word in extract_words(page):
            word = SimpleNamespace(**word)

            if word.size >= 11:  # heading text
                if cur_chapter is None:
                    cur_chapter = _new_chapter(page.page_number, word.top, word.bottom)
                elif not cur_chapter.record_chapter_name or (
                    cur_chapter.name_bottom != word.bottom
                    and cur_chapter.name_top != word.top
                ):
                    # Either body text has already been seen, or this heading
                    # word is on a different line than the current heading:
                    # close the current chapter and start a new one. (The
                    # position check used to compare the chapter with itself
                    # and could never be true.)
                    cur_chapter.page_stop = page.page_number
                    chapters.append(cur_chapter)
                    cur_chapter = _new_chapter(page.page_number, word.top, word.bottom)

                cur_chapter.name.append(word.text)
            else:
                if cur_chapter is None:
                    # Body text before any heading goes into an untitled chapter.
                    cur_chapter = _new_chapter(page.page_number, word.top, word.bottom)
                cur_chapter.record_chapter_name = False  # heading is complete
                cur_chapter.text.append(word.text)

    # Close the last chapter, if any text was found at all.
    if cur_chapter is not None:
        cur_chapter.page_stop = last_page_number
        chapters.append(cur_chapter)

    return chapters
|
| 147 |
+
|
| 148 |
+
# Default list of questions inserted into the reading prompts below.
BASE_POINTS = """
1. Who are the authors?
2. What is the process of the proposed method?
3. What is the performance of the proposed method? Please note down its performance metrics.
4. What are the baseline models and their performances? Please note down these baseline methods.
5. What dataset did this paper use?
"""

# Prompt for reading a paper part by part; "{}" is the slot for the key points.
READING_PROMPT = """
You are a researcher helper bot. You can help the user with research paper reading and summarizing. \n
Now I am going to send you a paper. You need to read it and summarize it for me part by part. \n
When you are reading, You need to focus on these key points:{}
"""

# Variant of READING_PROMPT that additionally asks for a title per part.
READING_PROMT_V2 = """
You are a researcher helper bot. You can help the user with research paper reading and summarizing. \n
Now I am going to send you a paper. You need to read it and summarize it for me part by part. \n
When you are reading, You need to focus on these key points:{},

And You need to generate a brief but informative title for this part.
Your return format:
- title: '...'
- summary: '...'
"""

# Correctly spelled alias; the misspelled name above is kept for existing users.
READING_PROMPT_V2 = READING_PROMT_V2

SUMMARY_PROMPT = "You are a researcher helper bot. Now you need to read the summaries of a research paper."
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
if __name__ == '__main__':
    # Manual smoke test. parse_pdf returns a Document, which carries the title
    # in ``extra_info`` and the author information inside ``text``; it cannot
    # be indexed with "user_info" / "title".
    z = parse_pdf("./build/test.pdf")
    print(z.extra_info["title"])
    print(z.text)
|
modules/utils.py
CHANGED
|
@@ -24,11 +24,7 @@ from pygments.formatters import HtmlFormatter
|
|
| 24 |
|
| 25 |
from modules.presets import *
|
| 26 |
import modules.shared as shared
|
| 27 |
-
|
| 28 |
-
logging.basicConfig(
|
| 29 |
-
level=logging.INFO,
|
| 30 |
-
format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
|
| 31 |
-
)
|
| 32 |
|
| 33 |
if TYPE_CHECKING:
|
| 34 |
from typing import TypedDict
|
|
@@ -333,8 +329,7 @@ def reset_textbox():
|
|
| 333 |
|
| 334 |
def reset_default():
|
| 335 |
newurl = shared.state.reset_api_url()
|
| 336 |
-
|
| 337 |
-
os.environ.pop("https_proxy", None)
|
| 338 |
return gr.update(value=newurl), gr.update(value=""), "API URL 和代理已重置"
|
| 339 |
|
| 340 |
|
|
@@ -346,6 +341,7 @@ def change_api_url(url):
|
|
| 346 |
|
| 347 |
|
| 348 |
def change_proxy(proxy):
|
|
|
|
| 349 |
os.environ["HTTPS_PROXY"] = proxy
|
| 350 |
msg = f"代理更改为了{proxy}"
|
| 351 |
logging.info(msg)
|
|
@@ -443,24 +439,6 @@ def transfer_input(inputs):
|
|
| 443 |
)
|
| 444 |
|
| 445 |
|
| 446 |
-
def get_proxies():
|
| 447 |
-
# 获取环境变量中的代理设置
|
| 448 |
-
http_proxy = os.environ.get("HTTP_PROXY") or os.environ.get("http_proxy")
|
| 449 |
-
https_proxy = os.environ.get("HTTPS_PROXY") or os.environ.get("https_proxy")
|
| 450 |
-
|
| 451 |
-
# 如果存在代理设置,使用它们
|
| 452 |
-
proxies = {}
|
| 453 |
-
if http_proxy:
|
| 454 |
-
logging.info(f"使用 HTTP 代理: {http_proxy}")
|
| 455 |
-
proxies["http"] = http_proxy
|
| 456 |
-
if https_proxy:
|
| 457 |
-
logging.info(f"使用 HTTPS 代理: {https_proxy}")
|
| 458 |
-
proxies["https"] = https_proxy
|
| 459 |
-
|
| 460 |
-
if proxies == {}:
|
| 461 |
-
proxies = None
|
| 462 |
-
|
| 463 |
-
return proxies
|
| 464 |
|
| 465 |
def run(command, desc=None, errdesc=None, custom_env=None, live=False):
|
| 466 |
if desc is not None:
|
|
|
|
| 24 |
|
| 25 |
from modules.presets import *
|
| 26 |
import modules.shared as shared
|
| 27 |
+
from modules.config import retrieve_proxy
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
if TYPE_CHECKING:
|
| 30 |
from typing import TypedDict
|
|
|
|
| 329 |
|
| 330 |
def reset_default():
    """Reset the API URL and the stored proxy, and return the UI updates.

    Returns:
        Tuple of two component updates (the restored API URL and an empty
        proxy field) and a status message.
    """
    newurl = shared.state.reset_api_url()
    # retrieve_proxy is used as a context manager elsewhere; enter it so the
    # empty proxy value is guaranteed to be applied.
    with retrieve_proxy(""):
        pass
    return gr.update(value=newurl), gr.update(value=""), "API URL 和代理已重置"
|
| 334 |
|
| 335 |
|
|
|
|
| 341 |
|
| 342 |
|
| 343 |
def change_proxy(proxy):
|
| 344 |
+
retrieve_proxy(proxy)
|
| 345 |
os.environ["HTTPS_PROXY"] = proxy
|
| 346 |
msg = f"代理更改为了{proxy}"
|
| 347 |
logging.info(msg)
|
|
|
|
| 439 |
)
|
| 440 |
|
| 441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
def run(command, desc=None, errdesc=None, custom_env=None, live=False):
|
| 444 |
if desc is not None:
|
requirements.txt
CHANGED
|
@@ -11,3 +11,4 @@ llama_index
|
|
| 11 |
langchain
|
| 12 |
markdown
|
| 13 |
PyPDF2
|
|
|
|
|
|
| 11 |
langchain
|
| 12 |
markdown
|
| 13 |
PyPDF2
|
| 14 |
+
pdfplumber
|