#Parent directory
"""Gradio app that assigns each keyword to one category using an LLM.

The user uploads two single-column spreadsheets (keywords and categories).
Keywords are sent to an OpenAI chat model in batches through a LangChain
extraction chain, and the resulting keyword/category pairs are shown in the
UI and written to a CSV file.
"""
import os
import sys
import time
from collections import defaultdict
from random import randint
from typing import Optional
from urllib.parse import urlparse

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # for data manipulation (pip install pandas)
from langchain.chains import create_extraction_chain
from langchain.chains.openai_functions import (
    create_structured_output_chain,
    create_tagging_chain_pydantic,
)
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.prompts import ChatPromptTemplate

# File that run_chain / results_to_csv write the categorized keywords to.
OUTPUT_CSV = 'themes_results.csv'

# Schema
schema = {
    "properties": {
        "keyword": {"type": "string"},
        "category": {"type": "string"},
    },
    "required": ["keyword", "category"],
}

# Input
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert marketing researcher specialized in the finance industry"),
        ("human",
         "{prompt_input}. "
         "Here you have the categories splitted by coma: {categories}. "
         "and Here you have the keywords splitted by coma: {keywords}."),
        ("human",
         "Tip: Make sure to answer in the correct format and DO NOT leave keywords "
         "without category and DO NOT skip keywords. "
         "Please categorize all the keywords that I give you, each keyword must "
         "have just one and only one category."),
    ]
)

llm = ChatOpenAI(
    temperature=0,
    openai_api_key=os.environ['OpenAI_APIKEY'],
    model="gpt-3.5-turbo",
)
chain = create_extraction_chain(schema, llm, prompt, verbose=1)


def _read_first_column(uploaded_file):
    """Return the first column of an uploaded CSV/Excel file as strings.

    Args:
        uploaded_file: Object exposing the file path as ``.name`` (this is
            what the ``gr.File`` inputs hand to ``run_chain``).

    Returns:
        The non-empty cells of the first column, each converted to ``str``
        so they can be safely joined with ``','``.
    """
    path = uploaded_file.name
    readers = [pd.read_csv, pd.read_excel]
    if path.lower().endswith(('.xlsx', '.xls')):
        # Try the reader matching the extension first; keep the other one
        # as a fallback for mislabelled files.
        readers.reverse()
    try:
        frame = readers[0](path)
    except Exception:
        frame = readers[1](path)
    # Empty cells come back as NaN (a float) and numeric cells as numbers;
    # both would make ','.join(...) raise TypeError.
    return [str(value) for value in frame.iloc[:, 0].dropna()]


def run_chain(input_prompt, keywords_file, categories_file, batch_size=50):
    """Categorize the uploaded keywords in batches and save them to CSV.

    Args:
        input_prompt: Instruction text placed in the ``prompt_input`` slot
            of the chat prompt.
        keywords_file: Uploaded CSV/Excel file; its first column holds the
            keywords.
        categories_file: Uploaded CSV/Excel file; its first column holds
            the categories.
        batch_size: Number of keywords sent to the chain per call.

    Returns:
        A tuple ``(results, csv_path)`` where ``results`` is the list of
        items returned by the chain across all batches and ``csv_path`` is
        the path of the CSV file the results were written to.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    keywords = _read_first_column(keywords_file)
    categories = _read_first_column(categories_file)
    joined_categories = ','.join(categories)

    results = []
    # Start from a fresh output file so the returned path always exists and
    # never points at a stale file from an earlier run.
    results_to_csv(results)

    for index in range(0, len(keywords), batch_size):
        batch = keywords[index:index + batch_size]
        try:
            result = chain.run({
                'prompt_input': input_prompt,
                'categories': joined_categories,
                'keywords': ','.join(batch),
            })
        except Exception as E:
            # Best effort: a failed batch is reported and skipped so the
            # remaining batches still get processed.
            print('this batch did not worked from {} to {}'.format(index, index + batch_size))
            print(E)
            result = []
        results.extend(result)
        # Checkpoint after every batch so partial progress is kept on disk.
        results_to_csv(results)

    return results, OUTPUT_CSV


def results_to_csv(results):
    """Write the extracted records to ``themes_results.csv``.

    Args:
        results: Iterable of dicts (one per extracted item). Keys become
            the CSV columns; a key missing from a record yields an empty
            cell instead of misaligned columns.
    """
    records = [record for record in results if isinstance(record, dict)]
    pd.DataFrame(records).to_csv(OUTPUT_CSV, index=False)


with gr.Blocks() as demo:
    prompt_input = gr.Text(
        "I need your help to analyze and categorize the provided list of keywords "
        "into the appropriate categories. The goal is to understand information "
        "demand on search engines within this industry. Each keyword represents a "
        "search and it should have a relation with the category. Extract each "
        "keyword and assign the best category among the given categories. Return "
        "every keyword with the relative category in pairs."
    )
    gr.Markdown("Upload CSV or xlsx with keywords: Just a csv with all the keywords in one column. Should have a header")
    # Extensions need the leading dot, same as the categories upload below.
    keywords_file = gr.File(file_types=['.csv', '.xlsx'], label='keywords')
    gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the categories in one column. Should have a header")
    categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
    with gr.Accordion("Open for More!"):
        gr.Markdown("Look at me...")
    btn = gr.Button(value="run")
    txt_3 = gr.Textbox(value="", label="Output")
    output_file = gr.File(label="Output File", file_count="single", file_types=["", ".", ".csv",".xls",".xlsx"])
    btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])

demo.launch()