import os
import streamlit as st
import pandas as pd
from datasets import load_dataset
import time
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_openai import ChatOpenAI

# Streamlit App Title and Description
st.title("Patent Data Analysis with LangChain")
st.write("""This app allows you to analyze patent-related datasets interactively using LangChain agents. You can upload datasets, load from Hugging Face, or use a repository directory dataset.""")

# Dataset loading without caching to support progress bar
def load_huggingface_dataset(dataset_name):
    # Initialize progress bar
    progress_bar = st.progress(0)
    try:
        # Incrementally update progress
        progress_bar.progress(10)
        # NOTE: name="sample" and uniform_split=True are accepted by the
        # default HUPD/hupd loading script; other datasets may reject these
        # kwargs, in which case the error is surfaced to the UI below.
        dataset = load_dataset(
            dataset_name,
            name="sample",
            split="train",
            trust_remote_code=True,
            uniform_split=True,
        )
        progress_bar.progress(50)
        if hasattr(dataset, "to_pandas"):
            df = dataset.to_pandas()
        else:
            df = pd.DataFrame(dataset)
        progress_bar.progress(100)  # Final update to 100%
        return df
    except Exception:
        progress_bar.progress(0)  # Reset progress bar on failure
        raise  # Re-raise with the original traceback intact

def load_uploaded_csv(uploaded_file):
    # Initialize progress bar
    progress_bar = st.progress(0)
    try:
        # Simulate progress
        progress_bar.progress(10)
        time.sleep(1)  # Simulate file processing delay
        progress_bar.progress(50)
        df = pd.read_csv(uploaded_file)
        progress_bar.progress(100)  # Final update
        return df
    except Exception:
        progress_bar.progress(0)  # Reset progress bar on failure
        raise  # Re-raise with the original traceback intact

# Dataset selection logic
def load_dataset_into_session():
    input_option = st.radio(
        "Select Dataset Input:",
        ["Use Repo Directory Dataset", "Use Hugging Face Dataset", "Upload CSV File"], index=1, horizontal=True
    )

    # Option 1: Load dataset from the repo directory
    if input_option == "Use Repo Directory Dataset":
        file_path = "./source/test.csv"
        if st.button("Load Dataset"):
            try:
                with st.spinner("Loading dataset from the repo directory..."):
                    st.session_state.df = pd.read_csv(file_path)
                st.success(f"File loaded successfully from '{file_path}'!")
            except Exception as e:
                st.error(f"Error loading dataset from the repo directory: {e}")

    # Option 2: Load dataset from Hugging Face
    elif input_option == "Use Hugging Face Dataset":
        dataset_name = st.text_input(
            "Enter Hugging Face Dataset Name:", value="HUPD/hupd"
        )
        if st.button("Load Dataset"):
            try:
                st.session_state.df = load_huggingface_dataset(dataset_name)
                st.success(f"Hugging Face Dataset '{dataset_name}' loaded successfully!")
            except Exception as e:
                st.error(f"Error loading Hugging Face dataset: {e}")

    # Option 3: Upload CSV File
    elif input_option == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload a CSV File:", type=["csv"])
        if uploaded_file:
            try:
                st.session_state.df = load_uploaded_csv(uploaded_file)
                st.success("File uploaded successfully!")
            except Exception as e:
                st.error(f"Error reading uploaded file: {e}")

# Load dataset into session
load_dataset_into_session()
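
# The loaded DataFrame persists in st.session_state across Streamlit reruns,
# so interacting with the widgets below does not re-trigger dataset loading.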

if "df" in st.session_state:
    df = st.session_state.df

    # Display dataset metadata
    st.write("### Dataset Metadata")
    st.text(f"Number of Rows: {df.shape[0]}")
    st.text(f"Number of Columns: {df.shape[1]}")
    st.text(f"Column Names: {', '.join(df.columns)}")

    # Display dataset preview
    st.write("### Dataset Preview")
    num_rows = st.slider("Select number of rows to display:", min_value=5, max_value=50, value=10)
    st.dataframe(df.head(num_rows))

    # Define LangChain CSV Agent
    st.header("Run Queries on Patent Data")

    with st.spinner("Setting up LangChain CSV Agent..."):
        df.to_csv("patent_data.csv", index=False)

        csv_agent = create_csv_agent(
            ChatOpenAI(temperature=0, model="gpt-4", api_key=os.getenv("OPENAI_API_KEY")),
            path=["patent_data.csv"],
            verbose=True,
            agent_type=AgentType.OPENAI_FUNCTIONS,
            allow_dangerous_code=True
        )
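
        # Note: this setup runs on every Streamlit rerun; for large datasets,
        # caching the agent (e.g. with st.cache_resource) would avoid the
        # repeated CSV write and agent construction.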

    # Query Input and Execution
    query = st.text_area("Enter your natural language query:", "How many patents are related to AI?")

    if st.button("Run Query"):
        with st.spinner("Running query..."):
            try:
                result = csv_agent.invoke({"input": query})
                st.success("Query executed successfully!")
                st.write("### Query Result:")
                # invoke() returns a dict; show only the agent's final answer.
                st.write(result.get("output", result))
            except Exception as e:
                st.error(f"Error executing query: {e}")