File size: 4,403 Bytes
370ba10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
# Page layout is configured immediately after importing Streamlit, before any
# other Streamlit call made by this script.
st.set_page_config(layout="wide")
from annotated_text import annotated_text, annotation
import fitz
import os
import chromadb
import uuid
from pathlib import Path

# Copy the key stored under OPEN_API_KEY into OPENAI_API_KEY. Indexing
# os.environ raises KeyError at startup when OPEN_API_KEY is not set.
os.environ['OPENAI_API_KEY'] = os.environ['OPEN_API_KEY']
st.title("Contracts Summary ")
import pandas as pd

from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import spacy
# Load the English model from SpaCy
# NOTE(review): `nlp` is not referenced in the visible code — confirm it is still needed.
nlp = spacy.load("en_core_web_md")

def util_upload_file_and_return_list_docs(uploaded_files):
    """Save each uploaded file into the working directory and open it with PyMuPDF.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``.name``
            and ``.getvalue()`` (returning the file's bytes).

    Returns:
        A ``(list_docs, list_save_path)`` tuple: the documents returned by
        ``fitz.open`` and the ``Path`` each file was written to, both in
        input order.

    Raises:
        ValueError: If an uploaded file's name has no usable file-name
            component (empty, ``.`` or ``..``).
    """
    list_docs = []
    list_save_path = []
    base_dir = Path(os.getcwd())
    for uploaded_file in uploaded_files:
        # The name is supplied by the client. Keep only its final component so
        # an absolute path or "../" segments cannot place the file outside the
        # working directory.
        safe_name = Path(uploaded_file.name).name
        if safe_name in ("", ".", ".."):
            raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")
        save_path = base_dir / safe_name
        save_path.write_bytes(uploaded_file.getvalue())
        list_docs.append(fitz.open(save_path))
        list_save_path.append(save_path)
    return list_docs, list_save_path

    
def util_get_list_page_and_passage(list_docs, list_save_path):
    """Return the full text of each document as one string per document.

    Args:
        list_docs: Iterable of documents; iterating a document yields pages
            that expose ``get_text()``.
        list_save_path: Unused; kept so existing callers keep working.

    Returns:
        A list with one string per document, built by concatenating the text
        of its pages in iteration order. A document with no pages yields "".
    """
    # str.join avoids rebuilding the accumulated string once per page.
    return ["".join(page.get_text() for page in doc) for doc in list_docs]
    

    

# Module-level default so `documents` exists before any form submission; it is
# reassigned inside the form block below once files are submitted.
documents = []


def get_summary_single_doc(text):
    """Summarize one document's text with a LangChain "refine" summarize chain.

    The text is split on newlines into chunks of roughly 3000 characters; the
    first chunk is summarized and every following chunk is used to refine
    that summary.

    Args:
        text: Full text of the document.

    Returns:
        The final summary string, or "" when ``text`` is empty or contains
        only whitespace.
    """
    # Nothing to summarize (e.g. a PDF without a text layer): return early
    # instead of handing the refine chain an empty document list.
    if not text or not text.strip():
        return ""

    from langchain.chains.summarize import load_summarize_chain
    from langchain.chat_models import ChatOpenAI
    from langchain.prompts import PromptTemplate
    from langchain.text_splitter import CharacterTextSplitter

    llm_key = os.environ.get("OPEN_API_KEY")

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=3000,
        chunk_overlap=20,
    )
    # Create the documents from the list of texts.
    texts = text_splitter.create_documents([text])
    if not texts:
        return ""

    prompt_template = """Write a concise summary of the following:
    {text}
    CONCISE SUMMARY:"""
    prompt = PromptTemplate.from_template(prompt_template)

    # Adjacent string literals are concatenated verbatim, so every fragment
    # must end with its own separator (space or newline).
    refine_template = (
        "Your job is to produce a final summary with key learnings\n"
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with detailed context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary\n"
        "If the context isn't useful, return the original summary."
    )
    refine_prompt = PromptTemplate.from_template(refine_template)

    # Define the LLM: OpenAI's chat model, temperature 0.
    model_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(temperature=0, openai_api_key=llm_key, model_name=model_name)

    refine_chain = load_summarize_chain(
        llm,
        chain_type="refine",
        question_prompt=prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=True,
    )
    refine_outputs = refine_chain({'input_documents': texts})
    return refine_outputs['output_text']
    

# Upload form: shows the instructions, collects one or more PDF files and, on
# submit, writes one summary per uploaded file.
with st.form("my_form"):
    multi = '''1. Download and Upload contract (PDF) .
    
    e.g. https://www.barc.gov.in/tenders/GCC-LPS.pdf
    
    e.g. https://www.montrosecounty.net/DocumentCenter/View/823/Sample-Construction-Contract
    '''
    st.markdown(multi)
    multi = '''2. Press Summary .'''
    st.markdown(multi)
    multi = '''
    ** Attempt is made for summary ** \n
    '''
    st.markdown(multi)

    list_docs = []
    list_save_path = []
    uploaded_files = st.file_uploader("Choose file(s)", accept_multiple_files=True)
    submitted = st.form_submit_button("Summary")

    if submitted:
        # With accept_multiple_files=True the uploader returns a list (empty
        # when nothing was uploaded), so test truthiness rather than None.
        if uploaded_files:
            list_docs, list_save_path = util_upload_file_and_return_list_docs(uploaded_files)
            documents = util_get_list_page_and_passage(list_docs, list_save_path)
            for index, item in enumerate(documents):
                st.write('Summary' + str(index+1) +  ' :: ')
                st.write(get_summary_single_doc(item))
        else:
            st.warning("Please upload at least one PDF file before pressing Summary.")