File size: 4,940 Bytes
c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c 5e0a689 c5ea37c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import copy
import json
import os
import re
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import huqie
from rag.settings import cron_logger
from rag.utils import rmSpace
def _remote_parse_resume(url, token, filename, binary, attempts=3):
    """Send the file to the remote resume-layout endpoint and return its ``data``.

    Any failure other than ``RuntimeError`` (connection errors, invalid JSON,
    a response missing the expected keys) is logged and retried, up to
    *attempts* times. A non-zero ``retcode`` in the response is raised
    immediately as ``RuntimeError`` carrying the service's ``retmsg``.

    Raises:
        RuntimeError: the service reported an error, or every attempt failed.
    """
    last_error = None
    for _ in range(attempts):
        try:
            res = requests.post(url + "/v1/layout/resume/", files=[(filename, binary)],
                                headers={"Authorization": token}, timeout=180)
            res = res.json()
            if res["retcode"] != 0:
                raise RuntimeError(res["retmsg"])
            return res["data"]
        except RuntimeError:
            # An explicit error from the service is not retried.
            raise
        except Exception as e:
            cron_logger.error("resume parsing:" + str(e))
            last_error = e
    # Fail loudly instead of handing ``None`` back to the caller.
    raise RuntimeError(
        "resume parsing failed after {} attempts: {}".format(attempts, last_error)
    ) from last_error


def chunk(filename, binary=None, callback=None, **kwargs):
    """Parse a resume through the remote layout service and build one document.

    Args:
        filename: Name of the resume file; must end in pdf, doc, docx or txt
            (case-insensitive). Also used as the path to read when *binary*
            is empty.
        binary: Raw file content. When falsy, the file is read from *filename*.
        callback: Optional progress hook called as ``callback(progress, msg)``.
            When omitted, progress reports are discarded.
        **kwargs: ``kb_id`` selects the knowledge base whose parser config
            receives the field map; when absent that update is skipped.

    Returns:
        A single-element list holding the document dict.

    Raises:
        NotImplementedError: the file extension is not supported.
        EnvironmentError: ``INFINIFLOW_SERVER`` or ``INFINIFLOW_TOKEN`` is unset.
        RuntimeError: the remote service reported an error or was unreachable.
    """
    if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
        raise NotImplementedError(
            "file type not supported yet(pdf, doc, docx, txt supported)")

    url = os.environ.get("INFINIFLOW_SERVER")
    if not url:
        raise EnvironmentError(
            "Please set environment variable: 'INFINIFLOW_SERVER'")
    token = os.environ.get("INFINIFLOW_TOKEN")
    if not token:
        raise EnvironmentError(
            "Please set environment variable: 'INFINIFLOW_TOKEN'")

    if callback is None:
        # The signature allows callback=None, so fall back to a no-op.
        def callback(prog, msg):
            pass

    if not binary:
        with open(filename, "rb") as f:
            binary = f.read()

    callback(0.2, "Resume parsing is going on...")
    resume = _remote_parse_resume(url, token, filename, binary)
    callback(0.6, "Done parsing. Chunking...")
    # Dump the raw parse result for debugging.
    print(json.dumps(resume, ensure_ascii=False, indent=2))

    # Field name -> human-readable label; a parenthesised suffix lists the
    # allowed values and is stripped before the label is written to content.
    field_map = {
        "name_kwd": "姓名/名字",
        "gender_kwd": "性别(男,女)",
        "age_int": "年龄/岁/年纪",
        "phone_kwd": "电话/手机/微信",
        "email_tks": "email/e-mail/邮箱",
        "position_name_tks": "职位/职能/岗位/职责",
        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
        "hightest_degree_kwd": "最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
        "first_degree_kwd": "第一学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
        "first_major_tks": "第一学历专业",
        "first_school_name_tks": "第一学历毕业学校",
        "edu_first_fea_kwd": "第一学历标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)",
        "degree_kwd": "过往学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
        "major_tks": "学过的专业/过往专业",
        "school_name_tks": "学校/毕业院校",
        "sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
        "edu_fea_kwd": "教育标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)",
        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
        "birth_dt": "生日/出生年份",
        "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
        "corporation_name_tks": "最近就职(上班)的公司/上一家公司",
        "edu_end_int": "毕业年份",
        "expect_city_names_tks": "期望城市",
        "industry_name_tks": "所在行业"
    }

    # Title: name-gender-position-age-简历, taking the first entry of list values.
    titles = []
    for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
        v = resume.get(n, "")
        if isinstance(v, list):
            # An empty list contributes an empty title part instead of failing.
            v = v[0] if v else ""
        if "tks" in n:
            v = rmSpace(v)
        titles.append(str(v))
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie("-".join(titles) + "-简历")
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])

    # Content: one "label: value" line per field the parser actually returned.
    pairs = []
    for n, m in field_map.items():
        v = resume.get(n)
        if not v:
            continue
        if isinstance(v, list):
            v = " ".join(str(item) for item in v)
        if "tks" in n:
            v = rmSpace(v)
        pairs.append((m, str(v)))

    # Escaped parentheses: drop only the "(...)" hint, keep the label itself.
    doc["content_with_weight"] = "\n".join(
        ["{}: {}".format(re.sub(r"\([^()]+\)", "", k), v) for k, v in pairs])
    doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
    doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])

    # Copy the raw field values; fields the parser omitted are left out.
    for n in field_map:
        if n in resume:
            doc[n] = resume[n]

    print(doc)
    # Without a kb_id (e.g. a command-line run) there is no config to update.
    if "kb_id" in kwargs:
        KnowledgebaseService.update_parser_config(
            kwargs["kb_id"], {"field_map": field_map})
    return [doc]
if __name__ == "__main__":
    # Command-line entry point: chunk the resume file given as first argument.
    import sys

    def dummy(a, b):
        """No-op progress callback for command-line runs."""
        pass

    # Exit with a usage hint instead of an IndexError when no path is given.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <resume file>".format(sys.argv[0]))
    chunk(sys.argv[1], callback=dummy)
|