#Parent directory 
import sys    
import os
import time 
import pandas as pd    # for data manipulation (pip install pandas)
import matplotlib.pyplot as plt
from random import randint
from urllib.parse import urlparse
import numpy as np
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from typing import Optional
from langchain.chains.openai_functions import (
    create_structured_output_chain, create_tagging_chain_pydantic
)
from langchain.prompts import ChatPromptTemplate
import gradio as gr
from collections import defaultdict

# Extraction schema: every extracted item carries two string fields,
# "keyword" and "category", and both of them are required.
schema = {
    "properties": {
        field: {"type": "string"} for field in ("keyword", "category")
    },
    "required": ["keyword", "category"],
}

# Prompt template, built from three messages:
#   1. a system message fixing the assistant's persona,
#   2. the task text followed by the comma-joined categories and keywords,
#   3. a closing reminder about the expected output format.
# Template variables: {prompt_input}, {categories}, {keywords}.
_SYSTEM_MESSAGE = (
    "system",
    "You are an expert marketing researcher specialized in the finance industry",
)
_TASK_MESSAGE = ("human", """{prompt_input}.
         Here you have the categories splitted by coma: {categories}.
         and Here you have the keywords splitted by coma: {keywords}.""")
_REMINDER_MESSAGE = (
    "human",
    "Tip: Make sure to answer in the correct format and DO NOT leave keywords without category and DO NOT skip keywords. Please categorize all the keywords that I give you, each keyword must have just one and only one category.",
)

prompt = ChatPromptTemplate.from_messages(
    [_SYSTEM_MESSAGE, _TASK_MESSAGE, _REMINDER_MESSAGE]
)

# Chat model with temperature pinned to 0. The API key is read from the
# OpenAI_APIKEY environment variable; a missing variable raises KeyError here.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.environ['OpenAI_APIKEY'],
)

# Extraction chain combining the schema, the model and the custom prompt.
chain = create_extraction_chain(schema, llm, prompt, verbose=1)

def _read_first_column(uploaded_file):
    """Return the first column of an uploaded CSV/Excel file as clean strings.

    Args:
        uploaded_file: Object exposing a ``name`` attribute that holds the
            path of the uploaded file on disk.

    Returns:
        A list of non-empty strings, one per non-missing cell in the first
        column. Values are converted with ``str`` and stripped so they can be
        joined into the prompt safely.
    """
    path = uploaded_file.name
    extension = os.path.splitext(path)[1].lower()
    # Try the reader that matches the extension first and keep the other one
    # as a fallback, so a mislabelled file is still readable.
    if extension in ('.xls', '.xlsx'):
        primary, fallback = pd.read_excel, pd.read_csv
    else:
        primary, fallback = pd.read_csv, pd.read_excel
    try:
        frame = primary(path)
    except Exception:
        # Deliberately broad: any parsing failure triggers the other reader,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        frame = fallback(path)
    cleaned = (str(value).strip() for value in frame.iloc[:, 0].dropna())
    return [text for text in cleaned if text]


def run_chain(input_prompt, keywords_file, categories_file, batch_size=50):
    """Categorize the uploaded keywords in batches with the extraction chain.

    The first column of each uploaded file is read (the header row is not
    treated as data). Keywords are sent to ``chain`` in batches together with
    the full list of categories. After every batch the accumulated results
    are written to disk through ``results_to_csv`` so partial progress
    survives a later failure.

    Args:
        input_prompt: Task instructions inserted into the prompt template.
        keywords_file: Uploaded file (with a ``name`` path attribute) whose
            first column lists the keywords.
        categories_file: Uploaded file (with a ``name`` path attribute) whose
            first column lists the categories.
        batch_size: Number of keywords sent per chain call. Must be >= 1.

    Returns:
        A tuple ``(results, 'themes_results.csv')`` where ``results`` is the
        list of everything the chain returned across all batches.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive batch size would otherwise never advance the loop.
        raise ValueError('batch_size must be a positive integer')

    keywords = _read_first_column(keywords_file)
    categories = _read_first_column(categories_file)
    joined_categories = ','.join(categories)  # identical for every batch

    results = []
    for start in range(0, len(keywords), batch_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        batch = keywords[start:start + batch_size]
        try:
            result = chain.run({'prompt_input': input_prompt, 'categories': joined_categories, 'keywords': ','.join(batch)})
        except Exception as E:
            # Best effort: report the failed batch and continue with the next.
            print('this batch did not worked from {} to {}'.format(start, start + len(batch)))
            print(E)
            result = []
        results += result
        results_to_csv(results)  # checkpoint after every batch

    if not keywords:
        # Nothing was processed; still create the file whose path is returned.
        results_to_csv(results)
    return results, 'themes_results.csv'

def results_to_csv(results, path='themes_results.csv'):
    """Write the extracted records to a CSV file, one row per record.

    Args:
        results: Iterable of dicts (for example ``{'keyword': ...,
            'category': ...}``). Records may omit keys; the corresponding
            cells are left empty instead of failing on unequal column lengths.
        path: Destination CSV path. Defaults to ``'themes_results.csv'``.

    Returns:
        None. The file at ``path`` is overwritten.
    """
    # Building the frame from the records aligns columns by key, so a record
    # with a missing key produces an empty cell rather than a ValueError.
    pd.DataFrame(list(results)).to_csv(path, index=False)


# Gradio UI: a prompt box, two file uploads (keywords and categories), a run
# button, and two outputs (raw results text and the generated CSV file).
with gr.Blocks() as demo:
    # Task instructions, pre-filled with the default prompt text.
    prompt_input = gr.Text("""I need your help to analyze and categorize the provided list of keywords
into the appropriate categories. 
The goal is to understand information demand on search engines within this industry. Each keyword represents a search and it should have a relation with the category. 
Extract each keyword and assign the best category among the given categories. Return every keyword with the relative category in pairs.""")
    gr.Markdown("Upload CSV or xlsx with keywords: Just a csv  with all the keywords in one column. Should have a header")
    # Extensions need the leading dot, matching the categories upload below.
    keywords_file = gr.File(file_types=['.csv', '.xlsx'], label='keywords')
    gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the categories in one column. Should have a header")
    categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
    with gr.Accordion("Open for More!"):
        gr.Markdown("Look at me...")

    btn = gr.Button(value="run")
    txt_3 = gr.Textbox(value="", label="Output")
    output_file = gr.File(label="Output File", 
                file_count="single", 
                file_types=["", ".", ".csv",".xls",".xlsx"])
    
    # run_chain returns (results, csv_path), mapped to the two outputs in order.
    btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])

demo.launch()