import gradio as gr
import pysbd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import CrossEncoder

model_name = "MaRiOrOsSi/t5-base-finetuned-question-answering"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

sentence_segmenter = pysbd.Segmenter(language='en', clean=False)
passage_retrieval_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
qa_model = pipeline("question-answering", model='a-ware/bart-squadv2')
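
# End-to-end flow, using the three models above in sequence:
#   1. The cross-encoder ranks the note's paragraphs against the question.
#   2. The extractive QA model pulls a candidate answer span from each top paragraph.
#   3. The T5 model rewrites the final answer from the supporting evidence sentences.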

def fetch_answers(question, clinical_note):
    clinical_note_paragraphs = clinical_note.splitlines()
    query_paragraph_list = [(question, para) for para in clinical_note_paragraphs if len(para.strip()) > 0]

    # Score every (question, paragraph) pair with the cross-encoder and keep the
    # five highest-scoring paragraphs, ordered from most to least relevant.
    scores = passage_retrieval_model.predict(query_paragraph_list)
    top_5_indices = scores.argsort()[-5:]
    top_5_query_paragraph_list = [query_paragraph_list[i] for i in top_5_indices]
    top_5_query_paragraph_list.reverse()
    
    top_5_query_paragraph_answer_list = ""
    count = 1
    for query, passage in top_5_query_paragraph_list:
        passage_sentences = sentence_segmenter.segment(passage)

        # Extractive QA: pull a candidate answer span out of the paragraph, then
        # trim a stray leading '.' or ':' once, before matching against sentences.
        answer = qa_model(question=query, context=passage)['answer']
        if answer.startswith('.') or answer.startswith(':'):
            answer = answer[1:].strip()

        # Keep every sentence that contains the extracted span as supporting evidence.
        evidence_sentence = ""
        for sentence in passage_sentences:
            if answer in sentence:
                evidence_sentence = evidence_sentence + " " + sentence

        # Generative QA: rewrite the answer from the evidence with the T5 model.
        model_input = f"question: {query} context: {evidence_sentence}"
        encoded_input = tokenizer([model_input],
                                  return_tensors='pt',
                                  max_length=512,
                                  truncation=True)
        output = model.generate(input_ids=encoded_input.input_ids,
                                attention_mask=encoded_input.attention_mask)
        output_answer = tokenizer.decode(output[0], skip_special_tokens=True)

        result_str = f"# ANSWER {count}: {output_answer}\n"
        result_str += f"REFERENCE: {evidence_sentence}\n\n"
        top_5_query_paragraph_answer_list += result_str
        count += 1

    return top_5_query_paragraph_answer_list
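
# Minimal direct-call sketch for local testing without the UI
# (the question and note below are hypothetical sample inputs):
#
#   sample_note = "Patient presents with hypertension.\nLisinopril 10 mg daily was started."
#   print(fetch_answers("What medication was started?", sample_note))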

demo = gr.Interface(
    fn=fetch_answers,
    # TODO: take input as real-time audio and use OpenAI Whisper for speech-to-text
    # TODO: allow the clinical note to be uploaded as a file (.txt or doc/docx)
    inputs=[gr.Textbox(lines=2, label='Question', show_label=True),
            gr.Textbox(lines=10, label='Document Text', show_label=True)],
    outputs="markdown",
    title='Document Question Answering System with Evidence from the Document'
)
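# launch() starts the Gradio server; when running locally (rather than on Spaces),
# launch(share=True) can additionally expose a temporary public URL.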
demo.launch()