Update app.py
Browse files
app.py
CHANGED
@@ -1,207 +1,201 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import gradio as gr
|
3 |
-
import numpy as np
|
4 |
-
import whisper
|
5 |
-
import os
|
6 |
-
import streamlit.components.v1 as components
|
7 |
-
import tempfile
|
8 |
-
import io
|
9 |
-
import requests
|
10 |
-
import json
|
11 |
-
import openai
|
12 |
-
from transformers import AutoConfig, AutoTokenizer, AutoModel
|
13 |
-
from summarizer import Summarizer
|
14 |
-
|
15 |
-
# File upload size bug?
|
16 |
-
|
17 |
-
# st.set_option('server.maxUploadSize', 500)
|
18 |
-
|
19 |
-
# Initialize session state for claims_extraction
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
# Initialize
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
audio_file
|
65 |
-
|
66 |
-
audio_data =
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
#
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
audio_data
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
with
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
#
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
#
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
)
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
model_name =
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
#
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
#
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
#
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
#
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
#
|
201 |
-
|
202 |
-
st.text("BERT Summaries:")
|
203 |
-
for i, summary in enumerate(summaries):
|
204 |
-
st.text(f"Summary {i + 1}:\n{summary}")
|
205 |
-
|
206 |
-
# Citation for the GitHub repo
|
207 |
-
st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & (Taylor)[https://huggingface.co/Cloudfaith] [join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>", unsafe_allow_html=True)
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import gradio as gr
|
3 |
+
import numpy as np
|
4 |
+
import whisper
|
5 |
+
import os
|
6 |
+
import streamlit.components.v1 as components
|
7 |
+
import tempfile
|
8 |
+
import io
|
9 |
+
import requests
|
10 |
+
import json
|
11 |
+
import openai
|
12 |
+
from transformers import AutoConfig, AutoTokenizer, AutoModel
|
13 |
+
from summarizer import Summarizer
|
14 |
+
|
15 |
+
# Upload size: Streamlit caps uploads at 200 MB by default. Raising the
# limit is done via config (.streamlit/config.toml: server.maxUploadSize),
# not at runtime -- presumably why the call below is commented out; verify.
# st.set_option('server.maxUploadSize', 500)
|
18 |
+
|
19 |
+
# Initialize session state for claims_extraction.
# Guarded with a membership test so a Streamlit rerun (every widget
# interaction reruns this script top-to-bottom) does not wipe previously
# extracted claims -- same pattern the file uses for 'learning_objectives'
# and 'whisper_model' further down.
if 'claims_extraction' not in st.session_state:
    st.session_state.claims_extraction = ""

# Initialize session state for userinput (the transcribed/edited text).
if 'userinput' not in st.session_state:
    st.session_state.userinput = ""  # Initialize user input
|
24 |
+
|
25 |
+
def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Returns a list of substrings in original order; the final slice may be
    shorter than *chunk_size*. An empty input yields an empty list.
    """
    return [text[offset:offset + chunk_size]
            for offset in range(0, len(text), chunk_size)]
|
35 |
+
|
36 |
+
# Streamlit Session State: seed 'learning_objectives' once so later reads
# never hit a missing key; the guard keeps any existing value across reruns.
if 'learning_objectives' not in st.session_state:
    st.session_state.learning_objectives = ""

# Initialize the Whisper model outside the button so the (expensive)
# load_model("base") call happens once per session instead of on every
# rerun / button click.
if 'whisper_model' not in st.session_state:
    st.session_state.whisper_model = whisper.load_model("base")
|
43 |
+
|
44 |
+
# Streamlit Interface

markdown_text = """
# 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor.
Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](nlpaueb/legal-bert-base-uncased).
- Save time and effort when ideating for your future business.
"""

# Render the Markdown content
st.markdown(markdown_text)

# API Key Input (masked; required before the OpenAI calls further down)
api_key = st.text_input("Enter your OpenAI API Key:", type="password")

# Audio Upload
st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")

audio_data = None

if audio_file is not None:
    audio_data = audio_file.read()
    # Preview the upload only. The original also emitted
    # st.info("Transcribing...") / st.success("Transcription complete")
    # here, before any transcription had happened -- those false status
    # messages are removed; real status is shown by the
    # 'Start Transcription' handler below.
    st.audio(audio_data, format="audio/wav")
|
69 |
+
|
70 |
+
# Transcription runs only when the user explicitly clicks the button.
if st.button('Start Transcription'):
    model = st.session_state.whisper_model

    if audio_data:
        # Persist the uploaded bytes to disk: whisper's transcribe() needs
        # a file path. The temp handle is named tmp_audio (the original
        # reused the name audio_file, shadowing the uploader widget), and
        # the `with` block is exited BEFORE transcribing so the bytes are
        # flushed and the file is readable.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
            tmp_audio.write(audio_data)
            audio_file_path = tmp_audio.name

        st.audio(audio_file_path, format="audio/wav")
        st.info("Transcribing...")
        result = model.transcribe(audio_file_path)
        transcript = result['text']  # Define the 'transcript' variable
        # Success is reported only after transcribe() returns (the original
        # showed it before the work started).
        st.success("Transcription complete")

        # delete=False means nothing else will clean this up -- remove the
        # temp file now that Whisper is done with it.
        os.remove(audio_file_path)

        with st.expander("See transcript"):
            st.markdown(transcript)

# Display the Whisper transcription
if 'transcript' in locals():
    st.text("Transcription:")
    st.text(transcript)

    # Update the user input field with the transcription
    st.session_state.userinput = st.text_area("Input Text:", transcript)
|
94 |
+
|
95 |
+
# Model Selection Dropdown
_model_options = [
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo",
    "gpt-4-0314",
    "gpt-4-0613",
    "gpt-4",
]
model_choice = st.selectbox("Select the model you want to use:", _model_options)

# System context describing the extractor's role.
# NOTE(review): `context` is defined but never referenced below --
# presumably intended as a system message for the chat call; confirm
# before removing or wiring it in.
context = "You are a patent claims identifier and extractor. You will freeform text, identify any claims contained therein that may be patentable. You identify, extract, print such claims, briefly explain why each claim is patentable."

# Configure the OpenAI client once a key has been provided.
if api_key:
    openai.api_key = api_key

# Section header for the extraction results.
st.write("### Patentable Claims:")

# Holds the most recent per-chunk extraction result.
claims_extraction = ""

# Placeholder that progress/status text is streamed into.
learning_status_placeholder = st.empty()

# The extract button is never disabled; kept for interface parity.
disable_button_bool = False
|
118 |
+
|
119 |
+
if st.session_state.userinput and api_key and st.button("Extract Claims", key="claims_extraction", disabled=disable_button_bool):
    # Split the user input into chunks.
    # Fix: read the text from session state -- the bare name `userinput`
    # was never defined at module level, so the original raised NameError
    # the moment the button was clicked.
    input_chunks = chunk_text(st.session_state.userinput)

    # Accumulates the claims extracted from every chunk
    all_extracted_claims = ""

    # enumerate() yields the 1-based position directly; the original
    # input_chunks.index(chunk) was an O(n) scan per iteration and reported
    # the wrong number whenever two chunks had identical text.
    for chunk_number, chunk in enumerate(input_chunks, start=1):
        # Display status message for the current chunk
        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")

        # API call to generate objectives for the current chunk
        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
            ]
        )

        # Extract the generated objectives from the API response
        claims_extraction = claims_extraction_response['choices'][0]['message']['content']

        # Append the extracted claims from the current chunk to the overall results
        all_extracted_claims += claims_extraction.strip()

    # Save the generated objectives to session state
    st.session_state.claims_extraction = all_extracted_claims

    # Display generated objectives for all chunks
    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")
|
149 |
+
|
150 |
+
# Get the extracted claims from Streamlit's session state
# (seeded to "" at the top of the script, so the attribute always exists).
claims_extracted = st.session_state.claims_extraction

# Display the Extracted Claims.
# Fix: the original checked the wrong key -- 'claims_extracted' is a local
# variable, never a session-state entry -- and then read the nonexistent
# st.session_state.claims_extracted, which would have raised. Show the
# claims whenever any have been extracted.
if claims_extracted:
    st.text("Extracted Claims:")
    st.text(claims_extracted)
|
157 |
+
|
158 |
+
# Define the BERT-based model name
model_name = 'nlpaueb/legal-bert-base-uncased'

# Build the LegalBERT extractive summarizer. output_hidden_states must be
# enabled because Summarizer ranks sentences from the hidden embeddings.
# NOTE(review): this reloads the model on every Streamlit rerun --
# consider caching it in session state like the Whisper model above.
legal_bert_config = AutoConfig.from_pretrained(model_name)
legal_bert_config.output_hidden_states = True
legal_bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
legal_bert_encoder = AutoModel.from_pretrained(model_name, config=legal_bert_config)
bert_legal_model = Summarizer(custom_model=legal_bert_encoder, custom_tokenizer=legal_bert_tokenizer)
print('Using model {}\n'.format(model_name))
|
168 |
+
|
169 |
+
# Get the extracted claims from Streamlit's session state
# claims_extracted = st.session_state.claims_extraction #moved up

# Define the chunk size (characters fed per summarization call)
chunk_size = 350

# Split the extracted claims into chunks; non-string values (defensive)
# produce no chunks and therefore no summaries.
if isinstance(claims_extracted, str):
    chunks = [claims_extracted[i:i+chunk_size] for i in range(0, len(claims_extracted), chunk_size)]
else:
    chunks = []

# Process each chunk with the BERT-based model
summaries = []
for chunk in chunks:
    summary = bert_legal_model(chunk, min_length=8, ratio=0.05)
    summaries.append(summary)

# Display the BERT Summaries.
# Fix: the original rendered every summary twice -- once via st.write under
# a "### Summary N" heading, then again via st.text under "BERT Summaries:".
# A single labelled pass is kept.
if summaries:
    st.text("BERT Summaries:")
    for i, summary in enumerate(summaries):
        st.text(f"Summary {i + 1}:\n{summary}")
|
199 |
+
|
200 |
+
# Citation for the GitHub repo
|
201 |
+
st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & (Taylor)[https://huggingface.co/Cloudfaith] [join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>", unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|