File size: 7,404 Bytes
042d924
f4a3d63
3599308
798b27c
 
 
 
 
 
 
d8d1045
56541bd
78f44f2
51b2f63
78f44f2
51b2f63
29ba926
8a059ac
 
091728f
c89ce42
 
 
 
798b27c
9f4d7d7
 
 
 
 
 
 
 
 
 
56541bd
 
 
 
97729cf
 
 
 
cd6a559
56541bd
cd6a559
 
 
 
 
 
 
 
 
 
 
56541bd
 
 
f4a3d63
 
 
042d924
f4a3d63
 
042d924
f4a3d63
2213377
97729cf
798b27c
f4a3d63
 
 
 
29ba926
 
 
 
72be11a
042d924
29ba926
 
a2fe734
29ba926
516f6da
a2fe734
f4a3d63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516f6da
f4a3d63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2fe734
f4a3d63
 
 
 
 
a2fe734
29ba926
 
 
 
85e4d89
 
56541bd
85e4d89
 
56541bd
85e4d89
 
 
 
 
 
 
56541bd
85e4d89
 
a2fe734
85e4d89
 
a2fe734
85e4d89
6d5d07f
 
 
 
9f4d7d7
85e4d89
 
 
 
 
9f4d7d7
85e4d89
 
97729cf
 
 
 
77632a5
72be11a
 
e1c3522
72be11a
 
 
 
 
 
 
 
 
 
 
e1c3522
72be11a
77632a5
 
d5f038c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import streamlit as st
import gradio as gr
import numpy as np
import whisper
import os
import streamlit.components.v1 as components
import tempfile
import io
import requests
import json
import openai

# File upload size bug?
# NOTE(review): Streamlit caps uploads at 200 MB by default; server.maxUploadSize
# must be set in config.toml or via CLI flag — st.set_option cannot set it.
# st.set_option('server.maxUploadSize', 500)

# Initialize session state for claims_extraction.
# The guard is required: Streamlit re-runs this script top-to-bottom on every
# widget interaction, so an unconditional reset would wipe previously
# extracted claims before the display code further down can read them.
if 'claims_extraction' not in st.session_state:
    st.session_state.claims_extraction = ""

# Initialize session state for userinput (holds transcription / freeform text)
if 'userinput' not in st.session_state:
    st.session_state.userinput = ""

# Define a function to split text into chunks
def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Returns a list of substrings covering the whole input in order; an empty
    string yields an empty list. The final slice may be shorter than
    *chunk_size*.
    """
    return [text[offset:offset + chunk_size]
            for offset in range(0, len(text), chunk_size)]

# Streamlit Session State
# Persist learning objectives across reruns (Streamlit re-executes the whole
# script on every widget interaction).
if 'learning_objectives' not in st.session_state:
    st.session_state.learning_objectives = ""

# Initialize the Whisper model outside the button
# Loading the model is expensive, so cache it in session state instead of
# reloading it on every rerun.
if 'whisper_model' not in st.session_state:
    st.session_state.whisper_model = whisper.load_model("base")

#StreamlitInterface

# Welcome banner. The LegalBert link now points at the full Hugging Face model
# page URL — the original relative path "nlpaueb/legal-bert-base-uncased"
# rendered as a broken link.
markdown_text = """
# 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor. 

Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](https://huggingface.co/nlpaueb/legal-bert-base-uncased). 

- Save time and effort when ideating for your future business.

"""

# Render the Markdown content
st.markdown(markdown_text)
# API Key Input
api_key = st.text_input("Enter your OpenAI API Key:", type="password")

# Audio Upload
audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])
audio_data = None

if audio_file is not None:
    audio_data = audio_file.read()

# Transcribe once the user clicks the button.
if st.button('Start Transcription'):
    model = st.session_state.whisper_model

    if audio_data:
        # Use a distinct name for the temp file so the uploaded-file handle
        # (audio_file) is not shadowed.
        # TODO(review): the temp file is never deleted (delete=False); consider
        # os.remove(audio_file_path) once transcription and playback are done.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
            tmp_audio.write(audio_data)
            audio_file_path = tmp_audio.name
        st.audio(audio_file_path, format="audio/wav")
        st.info("Transcribing...")
        result = model.transcribe(audio_file_path)
        transcript = result['text']  # Define the 'transcript' variable
        # Report success only after transcription actually finished (the
        # original announced completion before calling transcribe()).
        st.success("Transcription complete")

        with st.expander("See transcript"):
            st.markdown(transcript)

        # Update the user input field with the transcription
        st.session_state.userinput = st.text_area("Input Text:", transcript)

# Model Selection Dropdown
model_choice = st.selectbox(
    "Select the model you want to use:",
    ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
)

# System context for the extraction prompt. The original sentence was garbled
# ("You will freeform text"); fixed to "You will be given freeform text".
# NOTE(review): `context` is never sent in the ChatCompletion call below —
# presumably it was meant as a system message; confirm intent before wiring it in.
context = "You are a patent claims identifier and extractor. You will be given freeform text, identify any claims contained therein that may be patentable. You identify, extract, print such claims, briefly explain why each claim is patentable."

# Initialize OpenAI API (only when the user has supplied a key)
if api_key:
    openai.api_key = api_key

# Section header for the extraction results
st.write("### Patentable Claims:")
# Holds the most recent extraction result for this run
claims_extraction = ""
# Placeholder used to stream per-chunk status messages
learning_status_placeholder = st.empty()
disable_button_bool = False

# Run the extraction when text and an API key are present and the user clicks.
# NOTE: the button key must differ from the 'claims_extraction' session-state
# key written below — Streamlit forbids programmatically setting state under a
# widget's own key.
if st.session_state.userinput and api_key and st.button("Extract Claims", key="extract_claims_btn", disabled=disable_button_bool):
    # Split the user input into chunks. The bare name `userinput` was undefined
    # here (NameError) — the text lives in session state.
    input_chunks = chunk_text(st.session_state.userinput)

    # Accumulate the extracted claims across all chunks
    all_extracted_claims = ""

    # enumerate() replaces the original list.index(chunk) lookup, which was
    # O(n) per iteration and wrong when chunks repeat.
    for chunk_number, chunk in enumerate(input_chunks, start=1):
        # Display status message for the current chunk
        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")

        # API call to extract claims for the current chunk
        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
            ]
        )

        # Extract the generated claims from the API response
        claims_extraction = claims_extraction_response['choices'][0]['message']['content']

        # Append the extracted claims from the current chunk to the overall results
        all_extracted_claims += claims_extraction.strip()

    # Save the extracted claims to session state so they survive reruns
    st.session_state.claims_extraction = all_extracted_claims

    # Display extracted claims for all chunks
    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")


# (The extracted claims are read from session state immediately below the
# model setup; the duplicate read that used to sit here was dead code — it was
# re-assigned before first use.)

from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer

# Define the BERT-based model name
model_name = 'nlpaueb/legal-bert-base-uncased'

# Initialize BERT-based model and tokenizer
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True  # the extractive Summarizer consumes hidden states
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))

# Get the extracted claims from Streamlit's session state
claims_extracted = st.session_state.claims_extraction

# Character-count chunk size for the summarizer input
chunk_size = 350

# Split the extracted claims into chunks, reusing the chunk_text helper
# defined above instead of duplicating the slicing logic inline.
if isinstance(claims_extracted, str):
    chunks = chunk_text(claims_extracted, chunk_size)
else:
    chunks = []

# Summarize each chunk with the BERT-based extractive summarizer.
summaries = [bert_legal_model(chunk, min_length=8, ratio=0.05) for chunk in chunks]

# Render one section per chunk summary.
for i, summary in enumerate(summaries):
    st.write(f"### Summary {i+1}")
    st.write(summary)

# Display the Whisper transcription, extracted claims, and BERT summaries.
# Bug fixed: the original gated on 'claims_extracted' in st.session_state, but
# the actual state key is 'claims_extraction', so this section could never
# render. Also show whichever pieces exist (or) rather than requiring both (and).
has_transcript = 'transcript' in locals()
has_claims = bool(st.session_state.claims_extraction)

if has_transcript or has_claims:
    st.text("Transcription, Extracted Claims, and BERT Summaries:")

    if has_transcript:
        st.text("Transcription:")
        st.text(transcript)

    if has_claims:
        st.text("Extracted Claims:")
        st.text(st.session_state.claims_extraction)

    if summaries:
        st.text("BERT Summaries:")
        for i, summary in enumerate(summaries):
            st.text(f"Summary {i + 1}:\n{summary}")

# Citation footer. Fixed the Taylor link, which had reversed Markdown brackets
# ("(Taylor)[url]" renders as literal text instead of a hyperlink).
st.markdown("<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & [Taylor](https://huggingface.co/Cloudfaith) [join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>", unsafe_allow_html=True)