Spaces:
Sleeping
Sleeping
File size: 4,609 Bytes
ec6dd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import logging
import sys
import tempfile
import streamlit as st
import yaml
import copy
from menu import display_pages_menu, display_config
from pypdf import PdfReader
from utils import get_pdf_iframe, set_state
from country_by_country.processor import ReportProcessor
# Send log records to stdout as the bare message text, from INFO level upwards.
logging.basicConfig(
    format="%(message)s",
    level=logging.INFO,
    stream=sys.stdout,
)
def set_page_filter(value: dict):
    """Store ``value`` at the ``["config", "pagefilter"]`` path through ``set_state``.

    Args:
        value: The page-filter configuration entry to record.
    """
    state_path = ["config", "pagefilter"]
    set_state(state_path, value)
# Page chrome: wide layout, browser-tab title, main heading and sub-heading.
st.set_page_config(page_title="Accueil - upload de PDF", layout="wide")
st.title("Country by Country Tax Reporting analysis")
app_tagline = (
    "This app will help you extract a table containing financial information from a pdf"
)
st.subheader(app_tagline)

# Render the pages menu (helper imported from the local ``menu`` module).
display_pages_menu()
# Sidebar: PDF upload, optional custom configuration upload, page-filter choice.
with st.sidebar:
    st.markdown("# PDF Upload")

    st.markdown("## PDF Report to process")
    original_pdf = st.file_uploader(
        "Upload a pdf document containing financial table : ",
    )

    if original_pdf is not None:
        # Create the temporary copy only when there is something to store.
        # Streamlit re-executes this script on every interaction, so creating
        # the file unconditionally left an orphaned empty file on disk
        # (delete=False) after each rerun.
        mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        mytmpfile.write(original_pdf.read())
        # The copy is reopened through its path (``.name``) further down, so
        # the buffered bytes have to reach the disk before anything reads it.
        mytmpfile.flush()
        # The handle is intentionally kept open: it is stored in the session
        # state and handed to PdfReader later in this script.
        st.session_state["working_file_pdf"] = mytmpfile
        st.session_state["original_pdf_name"] = original_pdf.name

    if "original_pdf_name" in st.session_state:
        st.markdown(
            "Already loaded file : " + st.session_state["original_pdf_name"],
        )

    st.markdown("# Configuration:\n")
    # Upload personalized config if required
    loaded_config = st.file_uploader(
        "Upload a config if the default config doesn't suit you :",
    )
    loaded_config_dict = None
    if loaded_config is not None:
        if not loaded_config.name.lower().endswith((".yaml", ".yml")):
            st.error("Please upload a yaml file")
            loaded_config = None
        else:
            # Parse only files that passed the extension check; previously a
            # rejected file still reached yaml.safe_load(None) and crashed.
            try:
                loaded_config_dict = yaml.safe_load(loaded_config)
            except yaml.YAMLError:
                st.error("Unable to load yaml file config")
                loaded_config = None
            else:
                # An empty or non-mapping document has no ``.get``; treat it
                # as an invalid config instead of raising.
                is_valid_config = (
                    isinstance(loaded_config_dict, dict)
                    and bool(loaded_config_dict.get("pagefilter"))
                    and bool(loaded_config_dict.get("table_extraction"))
                )
                if not is_valid_config:
                    st.error("Please upload a valid config file")
                    loaded_config = None

    # Fall back to the default config the first time the script runs in a session.
    if not st.session_state.get("config_is_set", False):
        with open("extract_config.yaml", "r", encoding="utf-8") as f:
            default_config = f.read()
        st.session_state["initial_config"] = yaml.safe_load(default_config)
        st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
        st.session_state["config_is_set"] = True

    # A valid uploaded config replaces whatever was set before.
    if loaded_config is not None:
        st.session_state["initial_config"] = loaded_config_dict
        st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
        st.session_state["config_is_set"] = True

    # Set page filter: one radio option per "type" found in the initial config.
    page_filter_radio_dict = {
        pagefilter["type"]: pagefilter
        for pagefilter in st.session_state["initial_config"]["pagefilter"]
    }
    selected_page_filter = st.radio("Page filter", page_filter_radio_dict.keys())
    set_page_filter(page_filter_radio_dict[selected_page_filter])

    display_config()
if "working_file_pdf" in st.session_state:
    # Once a pdf has been uploaded, its temporary copy is stored under the
    # "working_file_pdf" key of the session state, so the code below only
    # runs after an upload.

    # Display the uploaded pdf
    st.markdown(
        get_pdf_iframe(st.session_state["working_file_pdf"].name),
        unsafe_allow_html=True,
    )
    if "first_time" not in st.session_state:
        # The "first_time" marker makes this branch run once per session;
        # later reruns of the script skip the processing below.
        st.session_state["first_time"] = False
        logging.info("Loading config and pdf")
        st.session_state["proc"] = ReportProcessor(st.session_state["config"])
        logging.info("Config and pdf loaded")

        # ``page_filter`` is expected to fill this dict in place: the
        # "selected_pages" entry is read right after the call.
        assets = {
            "pagefilter": {},
            "table_extractors": [],
        }

        # Filtering the pages
        st.session_state["proc"].page_filter(
            st.session_state["working_file_pdf"].name,
            assets,
        )
        logging.info("Assets : %s", assets)

        if not assets["pagefilter"].get("selected_pages"):
            # No page has been automatically selected by the page filter.
            # Hence, every page of the pdf is selected, letting the user
            # pick the relevant ones on the next page.
            number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
            assets["pagefilter"]["selected_pages"] = list(range(number_pages))

        st.session_state["assets"] = assets
        st.switch_page("pages/1_Selected_Pages.py")
|