import boto3
import os
import json
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain, SequentialChain
# Module-level chat model used by the chains below. The API key is read from
# the OPENAI environment variable; a missing variable raises KeyError here,
# at import time.
llm = ChatOpenAI(
    openai_api_key=os.environ["OPENAI"],
    temperature=0.0,
)
def get_resume_string(
    bucket: str = 'ausy-datalake-drift-nonprod',
    key: str = 'resume-matcher/raw/resume-dataset.csv',
    region_name: str = 'eu-west-1',
) -> list:
    """Download the resume dataset from S3 and return it as cleaned lines.

    Despite the function name, the result is a list with one string per
    line of the downloaded object, not a single string.

    Args:
        bucket: Name of the S3 bucket that holds the dataset.
        key: Object key of the dataset file inside ``bucket``.
        region_name: AWS region used to create the S3 client.

    Returns:
        A list of ``str``, one entry per line of the UTF-8 decoded object.
        In every entry ``'. '`` is replaced by ``'.\\n'`` and the bullet
        marker by ``'\\n - '``.
    """
    s3 = boto3.client('s3', region_name=region_name)
    response = s3.get_object(Bucket=bucket, Key=key)

    # NOTE(review): this is a plain line split, not CSV parsing, so a quoted
    # field that spans several lines ends up as several entries.
    lines = response['Body'].read().decode('utf-8').splitlines()

    # 'â¢' is presumably a mis-decoded bullet character present in the raw
    # data -- TODO confirm against the dataset before changing the literal.
    return [
        line.replace('. ', '.\n').replace('â¢', '\n - ')
        for line in lines
    ]
def get_skills(resumes: str) -> dict:
    """Ask the LLM to group resume text by domain and list each domain's skills.

    The resume text is inserted into a prompt that requests a JSON object
    keyed by domain, the chain is run with the module-level ``llm``, and the
    reply is parsed with ``json.loads``.

    Args:
        resumes: Loosely formatted resume text to analyse.

    Returns:
        The parsed JSON reply. The prompt asks for an object mapping each
        domain to a list of skills, so this is expected to be a ``dict``;
        the actual shape depends on the model following the prompt.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # The delimiter is spelled out and applied around {resumes} so the model
    # can tell the instructions apart from the resume text.
    template_resumes_get_skills = """
Given the following string, delimited by triple backticks and which contains resumes which are not properly formatted, categorize the resumes based on domain.
For each domain list the skills of the resumes that are part of that domain.
Create a JSON object where the keys are the domains and the values are a list containing the skills.
Return that JSON object only.
```{resumes}```
"""
    prompt_resumes_get_skills = ChatPromptTemplate.from_template(
        template=template_resumes_get_skills
    )
    resume_skills_chain = LLMChain(
        llm=llm,
        prompt=prompt_resumes_get_skills,
        output_key="resume_skills",
    )
    get_skills_resumes_chain = SequentialChain(
        chains=[resume_skills_chain],
        input_variables=["resumes"],
        output_variables=["resume_skills"],
        verbose=False,
    )

    result = get_skills_resumes_chain({"resumes": resumes})
    print(result)

    # Return the parsed reply so callers can actually use it; previously the
    # parsed value was assigned to a local and discarded.
    return json.loads(result['resume_skills'])
if __name__ == "__main__":
    # Script entry point: fetch the resume lines and run the skill
    # extraction once for each line.
    for resume_line in get_resume_string():
        get_skills(resume_line)