Tonic committed on
Commit ff81e3f · 1 Parent(s): 43fab1f

Update app.py

Files changed (1)
  1. app.py +201 -207
app.py CHANGED
@@ -1,207 +1,201 @@
- import streamlit as st
- import gradio as gr
- import numpy as np
- import whisper
- import os
- import streamlit.components.v1 as components
- import tempfile
- import io
- import requests
- import json
- import openai
- from transformers import AutoConfig, AutoTokenizer, AutoModel
- from summarizer import Summarizer
-
- # File upload size bug?
-
- # st.set_option('server.maxUploadSize', 500)
-
- # Initialize session state for claims_extraction
- # if 'claims_extraction' not in st.session_state:
- st.session_state.claims_extraction = ""
-
- # Initialize session state for userinput
- # if 'userinput' not in st.session_state:
- st.session_state.userinput = "" # Initialize user input
-
- # Define a function to split text into chunks
- def chunk_text(text, chunk_size=2000):
-     chunks = []
-     start = 0
-     while start < len(text):
-         end = start + chunk_size
-         chunk = text[start:end]
-         chunks.append(chunk)
-         start = end
-     return chunks
-
- # Streamlit Session State
- if 'learning_objectives' not in st.session_state:
-     st.session_state.learning_objectives = ""
-
- # Initialize the Whisper model outside the button
- if 'whisper_model' not in st.session_state:
-     st.session_state.whisper_model = whisper.load_model("base")
-
- #StreamlitInterface
-
- markdown_text = """
- # 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor.
-
- Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](nlpaueb/legal-bert-base-uncased).
-
- - Save time and effort when ideating for your future business.
-
- """
-
- # Render the Markdown content
- st.markdown(markdown_text)
- # API Key Input
- api_key = st.text_input("Enter your OpenAI API Key:", type="password")
-
- # Audio Upload
- st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
- audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")
-
- audio_data = None
-
- if audio_file is not None:
-     # Check the file size here and limit it to your desired size
-     # max_file_size = 0.1 # Specify the maximum file size in MB
-     # if len(audio_file.read()) / (1024 * 1024) > max_file_size:
-     #     st.error(f"File size exceeds the maximum allowed size of {max_file_size} MB.")
-     # else:
-     audio_data = audio_file.read()
-     st.audio(audio_data, format="audio/wav")
-     st.info("Transcribing...")
-     st.success("Transcription complete")
-
- # Moved the submit_button check here
- if st.button('Start Transcription'):
-     model = st.session_state.whisper_model
-
-     if audio_data:
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio_file:
-             audio_file.write(audio_data)
-             audio_file_path = audio_file.name
-         st.audio(audio_file_path, format="audio/wav")
-         st.info("Transcribing...")
-         st.success("Transcription complete")
-         result = model.transcribe(audio_file_path)
-         transcript = result['text'] # Define the 'transcript' variable
-
-         with st.expander("See transcript"):
-             st.markdown(transcript)
- # Display the Whisper transcription
- if 'transcript' in locals():
-     st.text("Transcription:")
-     st.text(transcript)
-
-     # Update the user input field with the transcription
-     st.session_state.userinput = st.text_area("Input Text:", transcript)
-
- # Model Selection Dropdown
- model_choice = st.selectbox(
-     "Select the model you want to use:",
-     ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
- )
-
- # Context, Subject, and Level
- context = "You are a patent claims identifier and extractor. You will freeform text, identify any claims contained therein that may be patentable. You identify, extract, print such claims, briefly explain why each claim is patentable."
- # userinput = st.text_input("Input Text:", "Freeform text here!") # Commented out, as it's updated above
-
- # Initialize OpenAI API
- if api_key:
-     openai.api_key = api_key
-
- # Learning Objectives
- st.write("### Patentable Claims:")
- # Initialize autogenerated objectives
- claims_extraction = ""
- # Initialize status placeholder
- learning_status_placeholder = st.empty()
- disable_button_bool = False
-
- if st.session_state.userinput and api_key and st.button("Extract Claims", key="claims_extraction", disabled=disable_button_bool):
-     # Split the user input into chunks
-     input_chunks = chunk_text(userinput)
-
-     # Initialize a variable to store the extracted claims
-     all_extracted_claims = ""
-
-     for chunk in input_chunks:
-         # Display status message for the current chunk
-         learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {input_chunks.index(chunk) + 1}...")
-
-         # API call to generate objectives for the current chunk
-         claims_extraction_response = openai.ChatCompletion.create(
-             model=model_choice,
-             messages=[
-                 {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
-             ]
-         )
-
-         # Extract the generated objectives from the API response
-         claims_extraction = claims_extraction_response['choices'][0]['message']['content']
-
-         # Append the extracted claims from the current chunk to the overall results
-         all_extracted_claims += claims_extraction.strip()
-
-     # Save the generated objectives to session state
-     st.session_state.claims_extraction = all_extracted_claims
-
-     # Display generated objectives for all chunks
-     learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")
-
-
- # Get the extracted claims from Streamlit's session state
- claims_extracted = st.session_state.claims_extraction
- # Display the Extracted Claims
- if 'claims_extracted' in st.session_state:
-     st.text("Extracted Claims:")
-     st.text(st.session_state.claims_extracted)
-
- # Define the BERT-based model name
- model_name = 'nlpaueb/legal-bert-base-uncased'
-
- # Initialize BERT-based model and tokenizer
- custom_config = AutoConfig.from_pretrained(model_name)
- custom_config.output_hidden_states = True
- custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
- custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
- bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
- print('Using model {}\n'.format(model_name))
-
- # Get the extracted claims from Streamlit's session state
- # claims_extracted = st.session_state.claims_extraction #moved up
-
- # Define the chunk size
- chunk_size = 350
-
- # Split the extracted claims into chunks
- if isinstance(claims_extracted, str):
-     chunks = [claims_extracted[i:i+chunk_size] for i in range(0, len(claims_extracted), chunk_size)]
- else:
-     chunks = []
-
- # Process each chunk with the BERT-based model
- summaries = []
- for chunk in chunks:
-     summary = bert_legal_model(chunk, min_length=8, ratio=0.05)
-     summaries.append(summary)
-
- # Now you have a list of summaries for each chunk
- # You can access them using `summaries[0]`, `summaries[1]`, etc.
- # After generating summaries
- for i, summary in enumerate(summaries):
-     st.write(f"### Summary {i+1}")
-     st.write(summary)
-
- # Display the BERT Summaries
- if summaries:
-     st.text("BERT Summaries:")
-     for i, summary in enumerate(summaries):
-         st.text(f"Summary {i + 1}:\n{summary}")
-
- # Citation for the GitHub repo
- st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & (Taylor)[https://huggingface.co/Cloudfaith] [join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>", unsafe_allow_html=True)
 
+ import streamlit as st
+ import gradio as gr
+ import numpy as np
+ import whisper
+ import os
+ import streamlit.components.v1 as components
+ import tempfile
+ import io
+ import requests
+ import json
+ import openai
+ from transformers import AutoConfig, AutoTokenizer, AutoModel
+ from summarizer import Summarizer
+
+ # File upload size bug?
+ # st.set_option('server.maxUploadSize', 500)
+
+ # Initialize session state for claims_extraction (guarded so reruns don't wipe results)
+ if 'claims_extraction' not in st.session_state:
+     st.session_state.claims_extraction = ""
+
+ # Initialize session state for userinput (guarded for the same reason)
+ if 'userinput' not in st.session_state:
+     st.session_state.userinput = ""
+
+ # Define a function to split text into chunks
+ def chunk_text(text, chunk_size=2000):
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = start + chunk_size
+         chunk = text[start:end]
+         chunks.append(chunk)
+         start = end
+     return chunks
+
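+ # Note: chunk_text splits on raw characters, not tokens; 2000 characters is a
+ # rough heuristic intended to keep each chunk well inside the chat models' context window.
+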
+ # Streamlit Session State
+ if 'learning_objectives' not in st.session_state:
+     st.session_state.learning_objectives = ""
+
+ # Initialize the Whisper model outside the button
+ if 'whisper_model' not in st.session_state:
+     st.session_state.whisper_model = whisper.load_model("base")
+
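+ # Keeping the Whisper model in session_state avoids reloading the weights on
+ # every Streamlit rerun; st.cache_resource would be an equivalent alternative.
+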
+ # Streamlit Interface
+
+ markdown_text = """
+ # 👋🏻 Welcome to [Team](https://huggingface.co/TeamTonic)[Tonic](https://huggingface.co/Tonic)'s Patentable Claims Extractor.
+ Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBERT](https://huggingface.co/nlpaueb/legal-bert-base-uncased).
+ - Save time and effort when ideating for your future business.
+ """
+
+ # Render the Markdown content
+ st.markdown(markdown_text)
+
+ # API Key Input
+ api_key = st.text_input("Enter your OpenAI API Key:", type="password")
+
+ # Audio Upload
+ st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
+ audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")
+
+ audio_data = None
+
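+ # st.file_uploader returns an UploadedFile (a file-like object); .read() below
+ # yields the raw bytes that both st.audio and the temporary file consume.
+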
+ if audio_file is not None:
+     audio_data = audio_file.read()
+     st.audio(audio_data, format="audio/wav")
+
+ # Moved the submit_button check here
+ if st.button('Start Transcription'):
+     model = st.session_state.whisper_model
+
+     if audio_data:
+         # Write the upload to a temp file; use a distinct name so the
+         # uploader's `audio_file` variable is not shadowed
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+             tmp_file.write(audio_data)
+             audio_file_path = tmp_file.name
+         st.audio(audio_file_path, format="audio/wav")
+         st.info("Transcribing...")
+         result = model.transcribe(audio_file_path)
+         transcript = result['text'] # Define the 'transcript' variable
+         st.success("Transcription complete")
+         os.remove(audio_file_path)  # Clean up the temp file (delete=False leaves it behind)
+
+         with st.expander("See transcript"):
+             st.markdown(transcript)
+
+ # Display the Whisper transcription
+ if 'transcript' in locals():
+     st.text("Transcription:")
+     st.text(transcript)
+
+     # Update the user input field with the transcription
+     st.session_state.userinput = st.text_area("Input Text:", transcript)
+
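+ # 'transcript' exists only during the rerun triggered by the button press;
+ # copying it into st.session_state.userinput above is what keeps the text
+ # available to the extraction step on later reruns.
+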
+ # Model Selection Dropdown
+ model_choice = st.selectbox(
+     "Select the model you want to use:",
+     ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
+ )
+
+ # System context for claim extraction
+ context = "You are a patent claims identifier and extractor. You will be given freeform text and will identify any claims contained therein that may be patentable. You identify, extract, and print such claims, briefly explaining why each claim is patentable."
+
+ # Initialize OpenAI API
+ if api_key:
+     openai.api_key = api_key
+
+ # Patentable Claims
+ st.write("### Patentable Claims:")
+
+ # Initialize autogenerated claims
+ claims_extraction = ""
+
+ # Initialize status placeholder
+ learning_status_placeholder = st.empty()
+
+ disable_button_bool = False
+
+ # The button key must differ from the 'claims_extraction' session-state key,
+ # since Streamlit disallows assigning to a key owned by a button widget
+ if st.session_state.userinput and api_key and st.button("Extract Claims", key="claims_extraction_btn", disabled=disable_button_bool):
+     # Split the user input into chunks
+     input_chunks = chunk_text(st.session_state.userinput)
+
+     # Initialize a variable to store the extracted claims
+     all_extracted_claims = ""
+
+     for i, chunk in enumerate(input_chunks):
+         # Display status message for the current chunk
+         learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {i + 1}...")
+
+         # API call to extract claims from the current chunk, using `context` as the system prompt
+         claims_extraction_response = openai.ChatCompletion.create(
+             model=model_choice,
+             messages=[
+                 {"role": "system", "content": context},
+                 {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
+             ]
+         )
+
+         # Extract the generated claims from the API response
+         claims_extraction = claims_extraction_response['choices'][0]['message']['content']
+
+         # Append the extracted claims from the current chunk to the overall results
+         all_extracted_claims += claims_extraction.strip() + "\n"
+
+     # Save the extracted claims to session state
+     st.session_state.claims_extraction = all_extracted_claims
+
+     # Display extracted claims for all chunks
+     learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")
+
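+ # Note: openai.ChatCompletion.create is the legacy (pre-1.0) openai-python API;
+ # with openai>=1.0 the equivalent call is client.chat.completions.create(...).
+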
+ # Get the extracted claims from Streamlit's session state
+ claims_extracted = st.session_state.claims_extraction
+
+ # Display the Extracted Claims
+ if claims_extracted:
+     st.text("Extracted Claims:")
+     st.text(claims_extracted)
+
+ # Define the BERT-based model name
+ model_name = 'nlpaueb/legal-bert-base-uncased'
+
+ # Initialize the BERT-based model and tokenizer, cached so Streamlit reruns
+ # don't reload the weights on every interaction
+ @st.cache_resource
+ def load_bert_legal_model(name):
+     custom_config = AutoConfig.from_pretrained(name)
+     custom_config.output_hidden_states = True
+     custom_tokenizer = AutoTokenizer.from_pretrained(name)
+     custom_model = AutoModel.from_pretrained(name, config=custom_config)
+     return Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
+
+ bert_legal_model = load_bert_legal_model(model_name)
+ print('Using model {}\n'.format(model_name))
+
+ # Get the extracted claims from Streamlit's session state
+ # claims_extracted = st.session_state.claims_extraction # moved up
+
+ # Define the chunk size
+ chunk_size = 350
+
+ # Split the extracted claims into chunks
+ if isinstance(claims_extracted, str):
+     chunks = [claims_extracted[i:i+chunk_size] for i in range(0, len(claims_extracted), chunk_size)]
+ else:
+     chunks = []
+
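+ # 350 characters per chunk is a conservative bound that keeps each piece well
+ # under LegalBERT's 512-token input limit, at the cost of splitting mid-sentence.
+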
+ # Process each chunk with the BERT-based model
+ summaries = []
+ for chunk in chunks:
+     summary = bert_legal_model(chunk, min_length=8, ratio=0.05)
+     summaries.append(summary)
+
+ # Display each chunk's summary after generating them
+ for i, summary in enumerate(summaries):
+     st.write(f"### Summary {i+1}")
+     st.write(summary)
+
+ # Display the BERT Summaries
+ if summaries:
+     st.text("BERT Summaries:")
+     for i, summary in enumerate(summaries):
+         st.text(f"Summary {i + 1}:\n{summary}")
+
+ # Credits
+ st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & [Taylor](https://huggingface.co/Cloudfaith). [Join us on Discord](https://discord.gg/5RmtZVVfgQ)</sub>", unsafe_allow_html=True)