FilipinosRich's picture
First draft of testing at scale
06cf97c
raw
history blame
1.92 kB
import boto3
import os
import json
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain, SequentialChain
# Shared chat model used by the chains below. temperature=0.0 is passed to the
# model constructor. The API key is read from the OPENAI environment variable
# at import time, so a missing variable raises KeyError before any function
# in this module runs.
llm = ChatOpenAI(
    temperature=0.0,
    openai_api_key=os.environ["OPENAI"],
)
def get_resume_string(
    *,
    bucket: str = 'ausy-datalake-drift-nonprod',
    key: str = 'resume-matcher/raw/resume-dataset.csv',
    region_name: str = 'eu-west-1',
) -> list:
    """Download the resume dataset from S3 and return it as a list of lines.

    Despite the function name, the result is a list of strings (one entry per
    line of the downloaded object), not a single string; callers iterate
    over it.

    Each line is lightly reformatted to be easier to read:
    every '. ' becomes '.' followed by a newline, and every bullet
    character ('•') becomes a newline followed by ' - '.

    Args:
        bucket: Name of the S3 bucket holding the dataset.
        key: Object key of the dataset inside the bucket.
        region_name: AWS region used to create the S3 client.

    Returns:
        A list of str, one reformatted entry per line of the object.

    NOTE(review): the object is split on line breaks rather than parsed as
    CSV, so a quoted field containing a line break would end up spread over
    several entries, and any header row is returned as an entry too --
    confirm this matches the dataset layout.
    """
    s3 = boto3.client('s3', region_name=region_name)
    response = s3.get_object(Bucket=bucket, Key=key)
    lines = response['Body'].read().decode('utf-8').splitlines()
    return [
        line.replace('. ', '.\n').replace('•', '\n - ')
        for line in lines
    ]
def get_skills(resumes: str) -> dict:
    """Ask the LLM to group resume text by domain and list skills per domain.

    The resume text is inserted into a prompt that asks the model to return
    only a JSON object whose keys are domains and whose values are lists of
    skills. The raw chain result is printed, then the model's reply is parsed
    as JSON and returned.

    Args:
        resumes: Unformatted resume text to send to the model.

    Returns:
        The parsed JSON reply. The prompt requests an object mapping each
        domain to a list of skills, so a dict is expected; the structure of
        the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    template_resumes_get_skills = """
Given the following string, delimited by <RESUMES> and </RESUMES> which contains resumes which are not properly formatted, categorize the resumes based on domain.
For each domain list the skills of the resumes that are part of that domain.
Create a JSON object where they keys are the domains and the values are a list containing the skills.
Return that JSON object only.
<RESUMES>
{resumes}
</RESUMES>
"""
    skills_prompt = ChatPromptTemplate.from_template(template=template_resumes_get_skills)
    skills_chain = LLMChain(llm=llm, prompt=skills_prompt, output_key="resume_skills")
    get_skills_resumes_chain = SequentialChain(
        chains=[skills_chain],
        input_variables=["resumes"],
        output_variables=["resume_skills"],
        verbose=False,
    )
    result = get_skills_resumes_chain({"resumes": resumes})
    # Kept from the original: this print is the script's only visible output.
    print(result)
    # The original parsed the reply into a local and dropped it; return it so
    # callers can actually use the extracted skills.
    return json.loads(result['resume_skills'])
if __name__ == "__main__":
    # Run the skill extraction once per entry returned by get_resume_string();
    # get_skills prints the chain result for each one.
    for resume_line in get_resume_string():
        get_skills(resume_line)