import json
import os

import boto3
from langchain.chains import LLMChain, SequentialChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Shared chat model used by get_skills. The API key is read from the OPENAI
# environment variable when this module is imported (KeyError if it is unset).
llm = ChatOpenAI(temperature=0.0, openai_api_key=os.environ["OPENAI"])


def get_resume_string(
    bucket: str = 'ausy-datalake-drift-nonprod',
    key: str = 'resume-matcher/raw/resume-dataset.csv',
    region_name: str = 'eu-west-1',
) -> list:
    """Download the resume dataset from S3 and return it as a list of lines.

    Despite the function name, the result is a list of strings (one per line
    of the downloaded object), not a single joined string.

    Args:
        bucket: Name of the S3 bucket holding the dataset.
        key: Object key of the dataset inside the bucket.
        region_name: AWS region used to create the S3 client.

    Returns:
        The UTF-8 decoded object split into lines, where every ". " has been
        replaced by ".\\n" and every bullet character by "\\n - ".
    """
    s3 = boto3.client('s3', region_name=region_name)
    response = s3.get_object(Bucket=bucket, Key=key)
    raw_lines = response['Body'].read().decode('utf-8').splitlines()

    # Break sentences and bullet points onto their own lines so each resume
    # reads as loosely structured text.
    return [
        line.replace('. ', '.\n').replace('•', '\n - ')
        for line in raw_lines
    ]


def get_skills(resumes: str) -> dict:
    """Ask the LLM to group the given resume text by domain and list skills.

    Args:
        resumes: Resume text that is substituted into the prompt.

    Returns:
        The model's answer parsed with json.loads. The prompt asks for a JSON
        object mapping each domain to a list of skills, so a dict is expected,
        but the shape ultimately depends on what the model returns.

    Raises:
        json.JSONDecodeError: If the model output is not valid JSON.
    """
    # NOTE(review): "delimited by and" looks like it lost its delimiter tokens
    # (possibly markup stripped at some point) - confirm the intended prompt.
    # The text is kept exactly as it was; adjacent literals are only used to
    # wrap the long line.
    template_resumes_get_skills = (
        " Given the following string, delimited by and which contains resumes"
        " which are not properly formatted, categorize the resumes based on"
        " domain. For each domain list the skills of the resumes that are"
        " part of that domain. Create a JSON object where they keys are the"
        " domains and the values are a list containing the skills. Return"
        " that JSON object only. {resumes} "
    )
    prompt = ChatPromptTemplate.from_template(
        template=template_resumes_get_skills
    )
    skills_chain = LLMChain(
        llm=llm,
        prompt=prompt,
        output_key="resume_skills",
    )
    get_skills_resumes_chain = SequentialChain(
        chains=[skills_chain],
        input_variables=["resumes"],
        output_variables=["resume_skills"],
        verbose=False,
    )

    result = get_skills_resumes_chain({"resumes": resumes})
    print(result)

    # Previously the parsed value was assigned to a local and dropped, so the
    # function always returned None; hand it back to the caller instead.
    return json.loads(result['resume_skills'])


def main() -> None:
    """Fetch the resume lines and run skill extraction on each one in turn."""
    for resume in get_resume_string():
        get_skills(resume)


if __name__ == "__main__":
    main()