import os
import tempfile
import streamlit as st
import pandas as pd
from datasets import load_dataset
import time
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_openai import ChatOpenAI

# Streamlit App Title and Description
st.title("Patent Data Analysis with LangChain")
st.write("""This app allows you to analyze patent-related datasets interactively using LangChain agents. You can upload datasets, load from Hugging Face, or use a repository directory dataset.""")

# Dataset loading without caching to support progress bar
def load_huggingface_dataset(dataset_name):
    # Initialize progress bar
    progress_bar = st.progress(0)
    try:
        # Incrementally update progress
        progress_bar.progress(10)
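        # Note: name="sample" and uniform_split=True appear to be specific to the
        # default HUPD/hupd loading script; other Hugging Face datasets may not
        # accept these keyword arguments.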
        dataset = load_dataset(dataset_name, name="sample", split="train", trust_remote_code=True, uniform_split=True)
        progress_bar.progress(50)
        if hasattr(dataset, "to_pandas"):
            df = dataset.to_pandas()
        else:
            df = pd.DataFrame(dataset)
        progress_bar.progress(100)  # Final update to 100%
        return df
    except Exception as e:
        progress_bar.progress(0)  # Reset progress bar on failure
        raise e

def load_uploaded_csv(uploaded_file):
    # Initialize progress bar
    progress_bar = st.progress(0)
    try:
        # Simulate progress
        progress_bar.progress(10)
        time.sleep(1)  # Simulate file processing delay
        progress_bar.progress(50)
        df = pd.read_csv(uploaded_file)
        progress_bar.progress(100)  # Final update
        return df
    except Exception as e:
        progress_bar.progress(0)  # Reset progress bar on failure
        raise e

# Dataset selection logic
def load_dataset_into_session():
    input_option = st.radio(
        "Select Dataset Input:",
        ["Use Repo Directory Dataset", "Use Hugging Face Dataset", "Upload CSV File"], index=1, horizontal=True
    )

    # Option 1: Load dataset from the repo directory
    if input_option == "Use Repo Directory Dataset":
        file_path = "./source/test.csv"
        if st.button("Load Dataset"):
            try:
                with st.spinner("Loading dataset from the repo directory..."):
                    st.session_state.df = pd.read_csv(file_path)
                st.success(f"File loaded successfully from '{file_path}'!")
            except Exception as e:
                st.error(f"Error loading dataset from the repo directory: {e}")

    # Option 2: Load dataset from Hugging Face
    elif input_option == "Use Hugging Face Dataset":
        dataset_name = st.text_input(
            "Enter Hugging Face Dataset Name:", value="HUPD/hupd"
        )
        if st.button("Load Dataset"):
            try:
                st.session_state.df = load_huggingface_dataset(dataset_name)
                st.success(f"Hugging Face Dataset '{dataset_name}' loaded successfully!")
            except Exception as e:
                st.error(f"Error loading Hugging Face dataset: {e}")

    # Option 3: Upload CSV File
    elif input_option == "Upload CSV File":
        uploaded_file = st.file_uploader("Upload a CSV File:", type=["csv"])
        if uploaded_file:
            try:
                st.session_state.df = load_uploaded_csv(uploaded_file)
                st.success("File uploaded successfully!")
            except Exception as e:
                st.error(f"Error reading uploaded file: {e}")

# Load dataset into session
load_dataset_into_session()
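
# st.session_state keeps the loaded DataFrame across Streamlit reruns, so the
# dataset does not have to be reloaded every time a widget changes.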

if "df" in st.session_state:
    df = st.session_state.df

    # Display dataset metadata
    st.write("### Dataset Metadata")
    st.text(f"Number of Rows: {df.shape[0]}")
    st.text(f"Number of Columns: {df.shape[1]}")
    st.text(f"Column Names: {', '.join(df.columns)}")

    # Display dataset preview
    st.write("### Dataset Preview")
    num_rows = st.slider("Select number of rows to display:", min_value=5, max_value=50, value=10)
    st.dataframe(df.head(num_rows))

    # Define LangChain CSV Agent
    st.header("Run Queries on Patent Data")

    with st.spinner("Setting up LangChain CSV Agent..."):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            df.to_csv(temp_file.name, index=False)

            csv_agent = create_csv_agent(
                ChatOpenAI(temperature=0, model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
                path=[temp_file.name],
                verbose=True,
                agent_type=AgentType.OPENAI_FUNCTIONS,
                allow_dangerous_code=True
            )
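
            # Note: delete=False leaves this temporary CSV on disk; in a
            # long-running deployment you may want to clean it up (for example
            # with os.remove(temp_file.name)) once the agent has been created.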

    # Query Input and Execution
    query = st.text_area("Enter your natural language query:", "How many patents are related to AI?")

    if st.button("Run Query"):
        with st.spinner("Running query..."):
            try:
                # Token limit configuration
                max_rows = 200  # Maximum rows per chunk, to keep each prompt within the model's token limit
                total_rows = len(df)

                if total_rows > max_rows:
                    results = []
                    for start in range(0, total_rows, max_rows):
                        chunk = df.iloc[start:start + max_rows]
                        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as chunk_file:
                            chunk.to_csv(chunk_file.name, index=False)

                            # Update the agent dynamically with the chunk
                            csv_agent = create_csv_agent(
                                ChatOpenAI(temperature=0, model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
                                path=[chunk_file.name],
                                verbose=False,
                                agent_type=AgentType.OPENAI_FUNCTIONS,
                                allow_dangerous_code=True
                            )
                            result = csv_agent.invoke(query)
                            # The agent returns a dict; keep only the "output" text for display
                            results.append(result["output"] if isinstance(result, dict) else str(result))

                    st.success("Query executed successfully!")
                    st.write("### Combined Query Results:")
                    st.write("\n".join(results))

                else:
                    result = csv_agent.invoke(query)
                    st.success("Query executed successfully!")
                    st.write("### Query Result:")
                    st.write(result["output"] if isinstance(result, dict) else result)

            except Exception as e:
                st.error(f"Error executing query: {e}")