Commit 577e81d · Parent: 799497c
Clement Vachet committed

style: clean code
app.py CHANGED
@@ -36,14 +36,10 @@ list_llm_simple = [os.path.basename(llm) for llm in list_llm]
 
 # Load PDF document and create doc splits
 def load_doc(list_file_path, chunk_size, chunk_overlap):
-    # Processing for one document only
-    # loader = PyPDFLoader(file_path)
-    # pages = loader.load()
     loaders = [PyPDFLoader(x) for x in list_file_path]
     pages = []
     for loader in loaders:
         pages.extend(loader.load())
-    # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size = chunk_size,
         chunk_overlap = chunk_overlap)
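The surviving load_doc body is the multi-document path: one PyPDFLoader per file, pages concatenated, then split into overlapping chunks. A standalone sketch of the same logic, assuming the legacy (pre-0.1) LangChain import paths; the file names are hypothetical, and the final return falls outside the hunk, so split_documents here is an assumption:

    # Standalone sketch of the retained load_doc logic (assumptions noted above).
    from langchain.document_loaders import PyPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    def load_doc(list_file_path, chunk_size, chunk_overlap):
        loaders = [PyPDFLoader(x) for x in list_file_path]
        pages = []
        for loader in loaders:
            pages.extend(loader.load())  # one Document per PDF page
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap)
        # Assumption: the real function returns the splits like this
        return text_splitter.split_documents(pages)

    doc_splits = load_doc(["paper.pdf", "notes.pdf"], chunk_size=600, chunk_overlap=50)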
@@ -77,26 +73,6 @@ def load_db():
 # Initialize langchain LLM chain
 def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
     progress(0.1, desc="Initializing HF tokenizer...")
-    # HuggingFacePipeline uses local model
-    # Note: it will download model locally...
-    # tokenizer=AutoTokenizer.from_pretrained(llm_model)
-    # progress(0.5, desc="Initializing HF pipeline...")
-    # pipeline=transformers.pipeline(
-    #     "text-generation",
-    #     model=llm_model,
-    #     tokenizer=tokenizer,
-    #     torch_dtype=torch.bfloat16,
-    #     trust_remote_code=True,
-    #     device_map="auto",
-    #     # max_length=1024,
-    #     max_new_tokens=max_tokens,
-    #     do_sample=True,
-    #     top_k=top_k,
-    #     num_return_sequences=1,
-    #     eos_token_id=tokenizer.eos_token_id
-    #     )
-    # llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': temperature})
-
     # HuggingFaceHub uses HF inference endpoints
     progress(0.5, desc="Initializing HF Hub...")
     # Use of trust_remote_code as model_kwargs
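This deletion drops the dormant local HuggingFacePipeline route; only the hosted-inference route survives. For orientation, a hedged sketch of what the HuggingFaceHub endpoint path typically looks like with the legacy LangChain import, since the exact kwargs used by app.py sit outside the hunk:

    # Hedged sketch of the HF Inference API route the commit keeps; requires
    # HUGGINGFACEHUB_API_TOKEN in the environment. The repo_id is a
    # hypothetical example, not necessarily one of this app's models.
    from langchain.llms import HuggingFaceHub

    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",  # hypothetical model id
        model_kwargs={
            "temperature": 0.7,
            "max_new_tokens": 1024,
            "top_k": 3,
        },
    )

Unlike the removed pipeline code, nothing is downloaded locally; generation runs on Hugging Face's hosted endpoints.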
@@ -268,16 +244,6 @@ def conversation(qa_chain, message, history):
     return qa_chain, gr.update(value=""), new_history, response_source1, response_source1_page, response_source2, response_source2_page, response_source3, response_source3_page
 
 
-def upload_file(file_obj):
-    list_file_path = []
-    for idx, file in enumerate(file_obj):
-        file_path = file_obj.name
-        list_file_path.append(file_path)
-    # print(file_path)
-    # initialize_database(file_path, progress)
-    return list_file_path
-
-
 def demo():
     with gr.Blocks(theme="base") as demo:
         vector_db = gr.State()
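Deleting upload_file looks safe: a gr.File component with file_count="multiple" already hands its event handlers the uploaded temp files, and the helper never used its loop variable (it read file_obj.name on every iteration), so it was effectively dead code. A small sketch of the direct pattern, assuming Gradio 3/4 File semantics:

    # Sketch: consume gr.File's value directly instead of an upload_file shim.
    # Assumption: each uploaded item exposes .name (a temp-file path) or is
    # already a path string, per Gradio 3/4 behavior.
    import gradio as gr

    def list_paths(files):
        return [f.name if hasattr(f, "name") else f for f in files]

    with gr.Blocks() as sketch:
        document = gr.File(file_count="multiple", file_types=[".pdf"])
        paths = gr.JSON(label="Resolved file paths")
        document.upload(list_paths, inputs=document, outputs=paths)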
@@ -297,7 +263,6 @@ def demo():
         with gr.Tab("Step 1 - Upload PDF"):
             with gr.Row():
                 document = gr.File(height=200, file_count="multiple", file_types=[".pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
-                # upload_btn = gr.UploadButton("Loading document...", height=100, file_count="multiple", file_types=["pdf"], scale=1)
 
         with gr.Tab("Step 2 - Process document"):
             with gr.Row():
@@ -347,7 +312,6 @@ def demo():
             clear_btn = gr.ClearButton(components=[msg, chatbot], value="Clear conversation")
 
         # Preprocessing events
-        #upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
         db_btn.click(initialize_database, \
             inputs=[document, slider_chunk_size, slider_chunk_overlap], \
             outputs=[vector_db, collection_name, db_progress])
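With the commented-out upload_btn wiring gone, database creation hangs solely off db_btn. The pattern is standard Gradio event wiring (handler, inputs list, outputs list); note that the trailing backslashes in app.py are redundant inside the call's parentheses, where Python already continues lines implicitly. A self-contained sketch with stand-in components:

    # Stand-in sketch of the db_btn wiring; component names mirror the diff,
    # but the handler body is a placeholder for the real vector-store build.
    import gradio as gr

    def initialize_database(files, chunk_size, chunk_overlap):
        return "vector_db_handle", "collection_name", "Complete"

    with gr.Blocks() as sketch:
        document = gr.File(file_count="multiple", file_types=[".pdf"])
        slider_chunk_size = gr.Slider(100, 1000, value=600, step=20, label="Chunk size")
        slider_chunk_overlap = gr.Slider(10, 200, value=50, step=10, label="Chunk overlap")
        db_btn = gr.Button("Generate vector database")
        vector_db = gr.State()
        collection_name = gr.State()
        db_progress = gr.Textbox(label="Initialization status")
        db_btn.click(
            initialize_database,  # no line-continuation backslashes needed here
            inputs=[document, slider_chunk_size, slider_chunk_overlap],
            outputs=[vector_db, collection_name, db_progress],
        )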