Joan Giner committed on
Commit
5dddb18
·
1 Parent(s): cfbb0ad

open version

Browse files
Files changed (2) hide show
  1. app.py +8 -4
  2. src/extractor.py +14 -8
app.py CHANGED
@@ -21,8 +21,8 @@ from src.extractor import Extractor
21
  load_dotenv()
22
 
23
  ## You api key from vendors or hugginface
24
- openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
- LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
  extractor = Extractor()
27
 
28
  # Define function to handle the Gradio interface
@@ -51,6 +51,10 @@ async def extraction(input_file, apikey, dimension):
51
  return results, completeness_report
52
 
53
  async def ui_extraction(input_file, apikey, dimension):
 
 
 
 
54
  file_name = input_file.name.split("/")[-1]
55
  results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
56
  # Build results in the correct format for the Gradio front-end
@@ -154,7 +158,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
154
 
155
  """)
156
  with gr.Column():
157
- apikey_elem = gr.Text(label="OpenAI API key (Not needed during review)")
158
  # gr.Markdown("""
159
  # <h3> Improving your data and assesing your dataset documentation </h3>
160
  # The generated warning also allows you quicly check the completeness of the documentation, and spotting gaps in the document
@@ -261,5 +265,5 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
261
 
262
  # Run the app
263
  #demo.queue(concurrency_count=5,max_size=20).launch()
264
- demo.launch(share=False,show_api=False,auth=("CIKM2023", "demodemo"))
265
 
 
21
  load_dotenv()
22
 
23
  ## You api key from vendors or hugginface
24
+ #openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
+ #LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
  extractor = Extractor()
27
 
28
  # Define function to handle the Gradio interface
 
51
  return results, completeness_report
52
 
53
  async def ui_extraction(input_file, apikey, dimension):
54
+ if (input_file == None):
55
+ raise gr.Error("Please upload a your data paper")
56
+ if (input_file.name.split(".")[-1] != "pdf"):
57
+ raise gr.Error("This is not a data paper, please uploead it in .pdf format")
58
  file_name = input_file.name.split("/")[-1]
59
  results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
60
  # Build results in the correct format for the Gradio front-end
 
158
 
159
  """)
160
  with gr.Column():
161
+ apikey_elem = gr.Text(label="OpenAI API key")
162
  # gr.Markdown("""
163
  # <h3> Improving your data and assesing your dataset documentation </h3>
164
  # The generated warning also allows you quicly check the completeness of the documentation, and spotting gaps in the document
 
265
 
266
  # Run the app
267
  #demo.queue(concurrency_count=5,max_size=20).launch()
268
+ demo.launch(share=False,show_api=False)
269
 
src/extractor.py CHANGED
@@ -65,8 +65,11 @@ class Extractor:
65
 
66
  # Extract text from PDF file using SCIPDF and Gorbid service (you need gorbid to use it)
67
  def extract_text_from_pdf(self, file_path):
68
- article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
69
- print("PDF parsed")
 
 
 
70
  finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
71
  for section in article_dict['sections']:
72
  sec = section['heading'] + ": "
@@ -109,9 +112,9 @@ class Extractor:
109
  # Process text and get the embeddings
110
  vectorspath = "./vectors/"+file_name
111
  if not apikey:
112
- apikey = openai.api_key
113
- gr.Error("Please set your api key")
114
- embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)
115
  if os.path.isfile(vectorspath+"/index.faiss"):
116
 
117
  # file exists
@@ -147,9 +150,12 @@ class Extractor:
147
 
148
  def build_chains(self, apikey):
149
  if not apikey:
150
- apikey = openai.api_key
151
- gr.Error("Please set your api key")
152
- LLMClient = OpenAI(model_name='text-davinci-003',openai_api_key=apikey,temperature=0)
 
 
 
153
  ## In-context prompt
154
  prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
155
  Question: {question}
 
65
 
66
  # Extract text from PDF file using SCIPDF and Gorbid service (you need gorbid to use it)
67
  def extract_text_from_pdf(self, file_path):
68
+ try:
69
+ article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
70
+ print("PDF parsed")
71
+ except:
72
+ raise gr.Error("Error parsing PDF, please update your data paper in the correct format")
73
  finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
74
  for section in article_dict['sections']:
75
  sec = section['heading'] + ": "
 
112
  # Process text and get the embeddings
113
  vectorspath = "./vectors/"+file_name
114
  if not apikey:
115
+ #apikey = openai.api_key
116
+ raise gr.Error("Please set your api key")
117
+ embeddings = OpenAIEmbeddings(openai_api_key=apikey)
118
  if os.path.isfile(vectorspath+"/index.faiss"):
119
 
120
  # file exists
 
150
 
151
  def build_chains(self, apikey):
152
  if not apikey:
153
+ #apikey = openai.api_key
154
+ raise gr.Error("Please set your Api key")
155
+ try:
156
+ LLMClient = OpenAI(model_name='text-davinci-003',openai_api_key=apikey,temperature=0)
157
+ except:
158
+ raise gr.Error("Your Api key is not valid")
159
  ## In-context prompt
160
  prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
161
  Question: {question}