DrishtiSharma committed on
Commit dd49a77 · verified · 1 Parent(s): 2045140

Create incorrect.py

Files changed (1)
  1. interim/incorrect.py +149 -0
interim/incorrect.py ADDED
@@ -0,0 +1,149 @@
+ import os
+ import time
+ 
+ import pandas as pd
+ import streamlit as st
+ from datasets import load_dataset
+ from langchain.agents.agent_types import AgentType
+ from langchain_experimental.agents.agent_toolkits import create_csv_agent
+ from langchain_openai import ChatOpenAI
+ 
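+ # Assumed entry point (not stated in the commit): streamlit run interim/incorrect.py
+ # The agents below read the OpenAI key from the OPENAI_API_KEY environment variable.
+ 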
+ # Streamlit App Title and Description
+ st.title("Patent Data Analysis with LangChain")
+ st.write("""This app lets you analyze patent-related datasets interactively using LangChain agents. You can upload a CSV, load a dataset from Hugging Face, or use the dataset bundled in the repository.""")
+ 
+ # Dataset loading without caching, so the progress bar can be shown
+ def load_huggingface_dataset(dataset_name):
+     # Initialize progress bar
+     progress_bar = st.progress(0)
+     try:
+         # Incrementally update progress
+         progress_bar.progress(10)
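+         # NOTE: name="sample" and uniform_split=True are arguments understood by the
+         # HUPD/hupd dataset script in particular; other datasets may not accept them.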
+         dataset = load_dataset(dataset_name, name="sample", split="train", trust_remote_code=True, uniform_split=True)
+         progress_bar.progress(50)
+         if hasattr(dataset, "to_pandas"):
+             df = dataset.to_pandas()
+         else:
+             df = pd.DataFrame(dataset)
+         progress_bar.progress(100)  # Final update to 100%
+         return df
+     except Exception:
+         progress_bar.progress(0)  # Reset progress bar on failure
+         raise
+ 
+ def load_uploaded_csv(uploaded_file):
+     # Initialize progress bar
+     progress_bar = st.progress(0)
+     try:
+         # Simulate progress
+         progress_bar.progress(10)
+         time.sleep(1)  # Simulate file processing delay
+         progress_bar.progress(50)
+         df = pd.read_csv(uploaded_file)
+         progress_bar.progress(100)  # Final update
+         return df
+     except Exception:
+         progress_bar.progress(0)  # Reset progress bar on failure
+         raise
+ 
+ # Dataset selection logic
+ def load_dataset_into_session():
+     input_option = st.radio(
+         "Select Dataset Input:",
+         ["Use Repo Directory Dataset", "Use Hugging Face Dataset", "Upload CSV File"],
+         index=1,
+         horizontal=True,
+     )
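+     # index=1 preselects the Hugging Face option on first render.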
+ 
+     # Option 1: Load dataset from the repo directory
+     if input_option == "Use Repo Directory Dataset":
+         file_path = "./source/test.csv"
+         if st.button("Load Dataset"):
+             try:
+                 with st.spinner("Loading dataset from the repo directory..."):
+                     st.session_state.df = pd.read_csv(file_path)
+                 st.success(f"File loaded successfully from '{file_path}'!")
+             except Exception as e:
+                 st.error(f"Error loading dataset from the repo directory: {e}")
+ 
+     # Option 2: Load dataset from Hugging Face
+     elif input_option == "Use Hugging Face Dataset":
+         dataset_name = st.text_input(
+             "Enter Hugging Face Dataset Name:", value="HUPD/hupd"
+         )
+         if st.button("Load Dataset"):
+             try:
+                 st.session_state.df = load_huggingface_dataset(dataset_name)
+                 st.success(f"Hugging Face Dataset '{dataset_name}' loaded successfully!")
+             except Exception as e:
+                 st.error(f"Error loading Hugging Face dataset: {e}")
+ 
+     # Option 3: Upload CSV File
+     elif input_option == "Upload CSV File":
+         uploaded_file = st.file_uploader("Upload a CSV File:", type=["csv"])
+         if uploaded_file:
+             try:
+                 st.session_state.df = load_uploaded_csv(uploaded_file)
+                 st.success("File uploaded successfully!")
+             except Exception as e:
+                 st.error(f"Error reading uploaded file: {e}")
+ 
+ # Load dataset into session
+ load_dataset_into_session()
+ 
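+ # Streamlit reruns this script on every interaction; st.session_state keeps the
+ # loaded DataFrame available across reruns.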
+ if "df" in st.session_state:
+     df = st.session_state.df
+ 
+     # Display dataset metadata
+     st.write("### Dataset Metadata")
+     st.text(f"Number of Rows: {df.shape[0]}")
+     st.text(f"Number of Columns: {df.shape[1]}")
+     st.text(f"Column Names: {', '.join(df.columns)}")
+ 
+     # Display dataset preview
+     st.write("### Dataset Preview")
+     num_rows = st.slider("Select number of rows to display:", min_value=5, max_value=50, value=10)
+     st.dataframe(df.head(num_rows))
+ 
+     # Define LangChain CSV Agent
+     st.header("Run Queries on Patent Data")
+ 
+     with st.spinner("Setting up LangChain CSV Agent..."):
+         df.to_csv("patent_data.csv", index=False)
+ 
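+         # allow_dangerous_code=True lets the agent execute model-generated Python
+         # locally (create_csv_agent refuses to run without it), so only enable it
+         # for trusted inputs. Note that this full-dataset agent is set up here,
+         # but the query loop below builds its own per-chunk agents.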
+         csv_agent = create_csv_agent(
+             ChatOpenAI(temperature=0, model="gpt-4", api_key=os.getenv("OPENAI_API_KEY")),
+             path=["patent_data.csv"],
+             verbose=True,
+             agent_type=AgentType.OPENAI_FUNCTIONS,
+             allow_dangerous_code=True,
+         )
+ 
+     # Query Input and Execution
+     query = st.text_area("Enter your natural language query:", "How many patents are related to AI?")
+ 
+     if st.button("Run Query"):
+         with st.spinner("Running query..."):
+             try:
+                 # Split query execution into smaller chunks if needed
+                 max_rows = 1000
+                 total_rows = len(df)
+                 results = []
+ 
+                 for start in range(0, total_rows, max_rows):
+                     chunk = df.iloc[start:start + max_rows]
+                     chunk.to_csv("chunk_data.csv", index=False)
+                     partial_agent = create_csv_agent(
+                         ChatOpenAI(temperature=0, model="gpt-4", api_key=os.getenv("OPENAI_API_KEY")),
+                         path=["chunk_data.csv"],
+                         verbose=True,
+                         agent_type=AgentType.OPENAI_FUNCTIONS,
+                         allow_dangerous_code=True,
+                     )
+                     result = partial_agent.invoke(query)
+                     # invoke() returns a dict; keep only the text answer so the
+                     # results can be joined for display below.
+                     results.append(str(result["output"]))
+ 
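+                 # Each chunk is answered independently, so aggregate questions
+                 # (e.g. counts) may still need the per-chunk answers combined.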
+                 st.success("Query executed successfully!")
+                 st.write("### Query Result:")
+                 st.write("\n".join(results))
+ 
+             except Exception as e:
+                 st.error(f"Error executing query: {e}")