Spaces:
				
			
			
	
			
			
		Build error
		
	
	
	
			
			
	
	
	
	
		
		
		Build error
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -48,34 +48,68 @@ if "vector_store" not in st.session_state: | |
| 48 | 
             
                st.session_state.vector_store = None
         | 
| 49 |  | 
| 50 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 51 | 
             
            # ----------------- Metadata Extraction -----------------
         | 
| 52 | 
             
            def extract_metadata_llm(pdf_path):
         | 
| 53 | 
            -
                """Extracts metadata using LLM  | 
| 54 |  | 
| 55 | 
             
                with pdfplumber.open(pdf_path) as pdf:
         | 
| 56 | 
             
                    first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
         | 
| 57 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 58 | 
             
                # Streamlit Debugging: Show extracted text
         | 
| 59 | 
            -
                st.subheader("π Extracted First Page Text  | 
| 60 | 
            -
                st.text_area(" | 
| 61 |  | 
| 62 | 
             
                # Define metadata prompt
         | 
| 63 | 
             
                metadata_prompt = PromptTemplate(
         | 
| 64 | 
            -
                    input_variables=["text"],
         | 
| 65 | 
             
                    template="""
         | 
| 66 | 
            -
                    Given the  | 
| 67 | 
            -
                    -  | 
| 68 | 
            -
                    -  | 
| 69 | 
            -
                    
         | 
|  | |
|  | |
|  | |
| 70 | 
             
                    Example output:
         | 
| 71 | 
            -
                    {
         | 
| 72 | 
             
                        "Title": "Example Paper Title",
         | 
| 73 | 
             
                        "Author": "John Doe, Jane Smith",
         | 
| 74 | 
             
                        "Emails": "[email protected], [email protected]",
         | 
| 75 | 
             
                        "Affiliations": "School of AI, University of Example"
         | 
| 76 | 
            -
                    }
         | 
| 77 | 
            -
             | 
| 78 | 
            -
                    Now, extract  | 
| 79 | 
             
                    {text}
         | 
| 80 | 
             
                    """
         | 
| 81 | 
             
                )
         | 
| @@ -85,10 +119,10 @@ def extract_metadata_llm(pdf_path): | |
| 85 |  | 
| 86 | 
             
                # Debugging: Log the LLM input
         | 
| 87 | 
             
                st.subheader("π LLM Input for Metadata Extraction")
         | 
| 88 | 
            -
                st.json({"text":  | 
| 89 |  | 
| 90 | 
             
                try:
         | 
| 91 | 
            -
                    metadata_response = metadata_chain.invoke({"text":  | 
| 92 |  | 
| 93 | 
             
                    # Debugging: Log raw LLM response
         | 
| 94 | 
             
                    st.subheader("π Raw LLM Response")
         | 
| @@ -103,7 +137,7 @@ def extract_metadata_llm(pdf_path): | |
| 103 | 
             
                            metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
         | 
| 104 | 
             
                        except json.JSONDecodeError:
         | 
| 105 | 
             
                            metadata_dict = {
         | 
| 106 | 
            -
                                "Title":  | 
| 107 | 
             
                                "Author": "Unknown",
         | 
| 108 | 
             
                                "Emails": "No emails found",
         | 
| 109 | 
             
                                "Affiliations": "No affiliations found"
         | 
| @@ -112,7 +146,7 @@ def extract_metadata_llm(pdf_path): | |
| 112 | 
             
                except Exception as e:
         | 
| 113 | 
             
                    st.error(f"β LLM Metadata Extraction Failed: {e}")
         | 
| 114 | 
             
                    metadata_dict = {
         | 
| 115 | 
            -
                        "Title":  | 
| 116 | 
             
                        "Author": "Unknown",
         | 
| 117 | 
             
                        "Emails": "No emails found",
         | 
| 118 | 
             
                        "Affiliations": "No affiliations found"
         | 
| @@ -128,6 +162,8 @@ def extract_metadata_llm(pdf_path): | |
| 128 | 
             
                st.json(metadata_dict)
         | 
| 129 |  | 
| 130 | 
             
                return metadata_dict
         | 
|  | |
|  | |
| 131 |  | 
| 132 |  | 
| 133 | 
             
            # ----------------- Step 1: Choose PDF Source -----------------
         | 
|  | |
| 48 | 
             
                st.session_state.vector_store = None
         | 
| 49 |  | 
| 50 |  | 
| 51 | 
            +
            # ----------------- Text Cleaning Functions -----------------
         | 
| 52 | 
            +
            def clean_extracted_text(text):
                """Normalise raw text extracted from a PDF page.

                The cleaning keeps the line structure intact (one logical line per
                ``\\n``) because downstream title detection splits on newlines.

                Steps, in order:
                  1. Normalise Windows/Mac line endings to ``\\n``.
                  2. Collapse runs of horizontal whitespace into one space.
                  3. Drop spaces/tabs that touch a line break.
                  4. Collapse consecutive line breaks into one.
                  5. Re-join words that were hyphenated across a line break.

                Args:
                    text: Raw extracted text. ``None`` or an empty string is accepted.

                Returns:
                    The cleaned text with leading/trailing whitespace removed, or an
                    empty string when there is nothing to clean.
                """
                if not text:
                    return ""

                text = text.replace("\r\n", "\n").replace("\r", "\n")
                # Only horizontal whitespace is collapsed here: matching "\s" would
                # also swallow newlines (e.g. " \n") and merge separate lines.
                text = re.sub(r'[^\S\n]{2,}', ' ', text)
                text = re.sub(r'[^\S\n]*\n[^\S\n]*', '\n', text)  # Trim around line breaks
                text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
                # Runs last so that "exam- \n\n ple" has already become "exam-\nple".
                text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # Fix hyphenated words split by a newline
                return text.strip()
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            def extract_title_manually(text):
                """Heuristically pick a paper title from the first lines of *text*.

                Only the first 5 lines are inspected. A line qualifies as the title
                when it has more than 5 whitespace-separated words and does not
                contain a common section/header word ("abstract", "introduction",
                "keywords", "contents", "table", "figure") as a whole word.

                Keywords are matched on word boundaries, so titles containing words
                such as "Stable" or "configure" are not rejected by the "table" or
                "figure" keywords.

                Args:
                    text: Cleaned page text with lines separated by ``\\n``. ``None``
                        or an empty string is accepted.

                Returns:
                    The first qualifying line (stripped), or ``"Unknown"`` when no
                    line qualifies.
                """
                import re  # Local import keeps this function self-contained.

                if not text:
                    return "Unknown"

                header_word = re.compile(
                    r"\b(?:abstract|introduction|keywords?|contents|tables?|figures?)\b",
                    re.IGNORECASE,
                )

                for line in text.split("\n")[:5]:  # Check only the first 5 lines
                    candidate = line.strip()
                    if len(candidate.split()) <= 5:
                        continue  # Too short to be a title
                    if header_word.search(candidate):
                        continue  # Looks like a section header or caption
                    return candidate  # Return first valid title
                return "Unknown"
         | 
| 75 | 
            +
             | 
| 76 | 
             
            # ----------------- Metadata Extraction -----------------
         | 
| 77 | 
             
            def extract_metadata_llm(pdf_path):
         | 
| 78 | 
            +
                """Extracts metadata using LLM with improved title detection and JSON handling."""
         | 
| 79 |  | 
| 80 | 
             
                with pdfplumber.open(pdf_path) as pdf:
         | 
| 81 | 
             
                    first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
         | 
| 82 |  | 
| 83 | 
            +
                # Apply text cleaning
         | 
| 84 | 
            +
                cleaned_text = clean_extracted_text(first_page_text)
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                # Attempt manual title extraction before LLM
         | 
| 87 | 
            +
                pre_extracted_title = extract_title_manually(cleaned_text)
         | 
| 88 | 
            +
             | 
| 89 | 
             
                # Streamlit Debugging: Show extracted text
         | 
| 90 | 
            +
                st.subheader("π Extracted First Page Text (Cleaned)")
         | 
| 91 | 
            +
                st.text_area("Cleaned Text:", cleaned_text, height=200)
         | 
| 92 |  | 
| 93 | 
             
                # Define metadata prompt
         | 
| 94 | 
             
                metadata_prompt = PromptTemplate(
         | 
| 95 | 
            +
                    input_variables=["text", "pre_title"],
         | 
| 96 | 
             
                    template="""
         | 
| 97 | 
            +
                    Given the first page of a research paper, extract metadata **strictly in JSON format**.
         | 
| 98 | 
            +
                    - The title is typically in the first few lines and is often in a larger font or bold.
         | 
| 99 | 
            +
                    - If a phrase like "Short Paper:" appears, the actual title follows.
         | 
| 100 | 
            +
                    - If no clear title is found, use the pre-extracted title: "{pre_title}".
         | 
| 101 | 
            +
                    - If a field is missing, return `"Unknown"`.
         | 
| 102 | 
            +
                    - Ensure the JSON format is **valid**.
         | 
| 103 | 
            +
             | 
| 104 | 
             
                    Example output:
         | 
| 105 | 
            +
                    {{
         | 
| 106 | 
             
                        "Title": "Example Paper Title",
         | 
| 107 | 
             
                        "Author": "John Doe, Jane Smith",
         | 
| 108 | 
             
                        "Emails": "[email protected], [email protected]",
         | 
| 109 | 
             
                        "Affiliations": "School of AI, University of Example"
         | 
| 110 | 
            +
                    }}
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                    Now, extract metadata from this document:
         | 
| 113 | 
             
                    {text}
         | 
| 114 | 
             
                    """
         | 
| 115 | 
             
                )
         | 
|  | |
| 119 |  | 
| 120 | 
             
                # Debugging: Log the LLM input
         | 
| 121 | 
             
                st.subheader("π LLM Input for Metadata Extraction")
         | 
| 122 | 
            +
                st.json({"text": cleaned_text, "pre_title": pre_extracted_title})
         | 
| 123 |  | 
| 124 | 
             
                try:
         | 
| 125 | 
            +
                    metadata_response = metadata_chain.invoke({"text": cleaned_text, "pre_title": pre_extracted_title})
         | 
| 126 |  | 
| 127 | 
             
                    # Debugging: Log raw LLM response
         | 
| 128 | 
             
                    st.subheader("π Raw LLM Response")
         | 
|  | |
| 137 | 
             
                            metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
         | 
| 138 | 
             
                        except json.JSONDecodeError:
         | 
| 139 | 
             
                            metadata_dict = {
         | 
| 140 | 
            +
                                "Title": pre_extracted_title,  # Use pre-extracted title as fallback
         | 
| 141 | 
             
                                "Author": "Unknown",
         | 
| 142 | 
             
                                "Emails": "No emails found",
         | 
| 143 | 
             
                                "Affiliations": "No affiliations found"
         | 
|  | |
| 146 | 
             
                except Exception as e:
         | 
| 147 | 
             
                    st.error(f"β LLM Metadata Extraction Failed: {e}")
         | 
| 148 | 
             
                    metadata_dict = {
         | 
| 149 | 
            +
                        "Title": pre_extracted_title,  # Use pre-extracted title
         | 
| 150 | 
             
                        "Author": "Unknown",
         | 
| 151 | 
             
                        "Emails": "No emails found",
         | 
| 152 | 
             
                        "Affiliations": "No affiliations found"
         | 
|  | |
| 162 | 
             
                st.json(metadata_dict)
         | 
| 163 |  | 
| 164 | 
             
                return metadata_dict
         | 
| 165 | 
            +
                    
         | 
| 166 | 
            +
             | 
| 167 |  | 
| 168 |  | 
| 169 | 
             
            # ----------------- Step 1: Choose PDF Source -----------------
         | 
