# taxobservatory-demo / pages/0_Import_File.py
# Ronan
# feat: add new filters
# dd6a24d
import logging
import sys
import tempfile
import streamlit as st
import yaml
import copy
from menu import display_pages_menu, display_config
from pypdf import PdfReader
from utils import get_pdf_iframe, set_state, generate_assets
from country_by_country.processor import ReportProcessor
# Route log records of level INFO and above to stdout, emitting only the message text.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
def set_page_filter(value: dict):
    """Record ``value`` as the active page filter.

    Delegates to ``set_state`` (imported from ``utils``) with the key path
    ``["config", "pagefilter"]``.

    Args:
        value: Page-filter configuration to store.
    """
    state_path = ["config", "pagefilter"]
    set_state(state_path, value)
def initiate_configuration() -> None:
    """Reset the working config from the pristine ``initial_config``.

    Reads ``st.session_state["initial_config"]`` and writes:

    * ``st.session_state["config"]`` - a deep copy of the initial config in
      which a list-valued ``"pagefilter"`` is collapsed to its first entry;
    * ``st.session_state["selected_page_filter_name"]`` - the ``"type"`` of
      that page filter.

    Raises:
        KeyError: if ``initial_config`` or its ``"pagefilter"`` / ``"type"``
            keys are missing.
        IndexError: if ``"pagefilter"`` is an empty list.
    """
    working_config = copy.deepcopy(st.session_state["initial_config"])

    page_filters = working_config["pagefilter"]
    if isinstance(page_filters, list):
        # Pick the entry from the *copied* list: taking it from
        # ``initial_config`` would alias the pristine config, so later edits
        # to the working page filter would silently modify the initial one.
        working_config["pagefilter"] = page_filters[0]

    st.session_state["config"] = working_config
    st.session_state["selected_page_filter_name"] = working_config["pagefilter"][
        "type"
    ]
def on_pdf_file_upload() -> None:
    """Callback of the PDF uploader: persist the upload and request a redirect.

    Copies the uploaded PDF into the module-level temporary file
    ``mytmpfile``, stores that file and the original file name in the session
    state, calls ``generate_assets()`` and finally sets ``"page_redirection"``
    so the script body can switch page (``st.switch_page`` is not allowed in a
    callback).

    Does nothing when the uploader holds no file.
    """
    uploaded_pdf = st.session_state.original_pdf
    if uploaded_pdf is None:
        # The widget value is None when the uploader has been cleared; there
        # is nothing to read in that case.
        return

    # Change states related to the pdf file upload
    mytmpfile.write(uploaded_pdf.read())
    # Push buffered bytes to disk: the file is reopened through its name
    # further down in this script.
    mytmpfile.flush()
    st.session_state["working_file_pdf"] = mytmpfile
    st.session_state["original_pdf_name"] = uploaded_pdf.name

    # Generate assets
    generate_assets()

    st.session_state["page_redirection"] = "pages/1_Selected_Pages.py"
def on_config_file_upload() -> None:
    """Callback: adopt the uploaded YAML file as the new initial config.

    The ``"initial_uploaded_config"`` session entry is the value of a file
    uploader widget (an uploaded file object, or ``None``), not a parsed
    config, so it is parsed here before being stored as ``"initial_config"``.
    The current configuration is left untouched when no file is present, the
    YAML cannot be parsed, or the document is not a mapping containing a
    ``"pagefilter"`` entry (which ``initiate_configuration`` requires).
    """
    uploaded_config = st.session_state.get("initial_uploaded_config")
    if uploaded_config is None:
        return

    try:
        # getvalue() does not move the stream position, so the uploaded file
        # can still be read again later in the script run.
        parsed_config = yaml.safe_load(uploaded_config.getvalue())
    except yaml.YAMLError:
        logging.warning("Uploaded config is not valid YAML; keeping current config")
        return

    if not isinstance(parsed_config, dict) or not parsed_config.get("pagefilter"):
        logging.warning("Uploaded config has no 'pagefilter'; keeping current config")
        return

    st.session_state["initial_config"] = parsed_config
    initiate_configuration()
def on_change_page_filter(name_to_filter_dict: dict) -> None:
    """Callback of the page-filter radio button.

    Copies the radio selection into ``"selected_page_filter_name"`` and
    applies the matching filter configuration via ``set_page_filter``.

    Args:
        name_to_filter_dict: Mapping from page-filter name to its
            configuration.
    """
    # The selection is mirrored under a second key because selector widgets
    # wipe their own key on reload.
    chosen_name = st.session_state["radio_button_filter_selection"]
    st.session_state["selected_page_filter_name"] = chosen_name
    set_page_filter(name_to_filter_dict[chosen_name])
# Honour a redirection requested by a callback.
# Callbacks cannot call st.switch_page themselves, so they leave the target
# page under "page_redirection" and the switch happens here instead.
pending_redirect = st.session_state.get("page_redirection", False)
if pending_redirect:
    # Reset the request before switching so it is only honoured once.
    st.session_state["page_redirection"] = False
    st.switch_page(pending_redirect)
# Page chrome: browser title, wide layout, heading and navigation menu.
st.set_page_config(page_title="Accueil - upload de PDF", layout="wide")
st.title("Country by Country Tax Reporting analysis")
page_intro = (
    "This app will help you extract a table containing financial "
    "information from a pdf"
)
st.subheader(page_intro)
display_pages_menu()

# Scratch file that on_pdf_file_upload fills with the uploaded PDF.
# NOTE(review): with delete=False a new file is created (and never removed)
# each time this statement executes - confirm whether cleanup is needed.
mytmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
# State initialization: runs while "first_time" is absent from the session.
if "first_time" not in st.session_state:
    logging.info("State initialization...")

    logging.info("... loading default extract config")
    with open("extract_config.yaml", "r", encoding="utf-8") as f:
        st.session_state["initial_config"] = yaml.safe_load(f)
    initiate_configuration()

    logging.info("... initializing processor and assets")
    st.session_state["proc"] = ReportProcessor(st.session_state["config"])
    st.session_state["assets"] = {
        "pagefilter": {},
        "table_extractors": [],
    }

    # Set the flag last: if anything above raises, the next execution retries
    # the initialization instead of continuing with a half-built session.
    st.session_state["first_time"] = False
# Sidebar: PDF upload, optional custom config upload, page-filter selection
# and config display.
# NOTE(review): the original indentation was lost; every statement up to and
# including display_config() is assumed to live inside the sidebar - confirm.
with st.sidebar:
    st.markdown("# PDF Upload")

    st.markdown("## PDF Report to process")
    original_pdf = st.file_uploader(
        "Upload a pdf document containing financial table : ",
        key="original_pdf",
        on_change=on_pdf_file_upload,
    )

    if "original_pdf_name" in st.session_state:
        st.markdown(
            "Already loaded file : " + st.session_state["original_pdf_name"],
        )

    st.markdown("# Configuration:\n")
    # Upload personalized config if required
    loaded_config = st.file_uploader(
        "Upload a config if the default config doesn't suit you :",
        key="initial_uploaded_config",
        on_change=initiate_configuration,
    )

    # Stays None unless the upload passes every validation step below.
    loaded_config_dict = None
    if loaded_config is not None:
        if not loaded_config.name.endswith(".yaml"):
            st.error("Please upload a yaml file")
        else:
            # Only parse files that passed the extension check; previously a
            # rejected file still reached yaml.safe_load as None and crashed.
            try:
                candidate_config = yaml.safe_load(loaded_config)
            except yaml.YAMLError:
                st.error("Unable to load yaml file config")
            else:
                # A YAML document may be a scalar, a list or empty; only a
                # mapping with both required sections is accepted.
                if (
                    isinstance(candidate_config, dict)
                    and candidate_config.get("pagefilter", False)
                    and candidate_config.get("table_extraction", False)
                ):
                    loaded_config_dict = candidate_config
                else:
                    st.error("Please upload a valid config file")

    # Extract config
    # NOTE(review): this re-applies the uploaded config on every execution
    # while a file is present in the uploader - confirm this is intended.
    if loaded_config_dict is not None:
        st.session_state["initial_config"] = loaded_config_dict
        st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])

    # Set page filter
    configured_page_filters = st.session_state["initial_config"]["pagefilter"]
    if isinstance(configured_page_filters, dict):
        # initiate_configuration accepts a single (non-list) page filter, so
        # accept that shape here as well.
        configured_page_filters = [configured_page_filters]
    page_filter_name_to_config_mapping = {
        pagefilter["type"]: pagefilter for pagefilter in configured_page_filters
    }
    page_filter_list = list(page_filter_name_to_config_mapping)

    # The remembered name may not exist in a freshly uploaded config; fall
    # back to the first filter instead of raising ValueError.
    remembered_filter_name = st.session_state.get("selected_page_filter_name")
    if remembered_filter_name in page_filter_list:
        current_selected_page_filter_index = page_filter_list.index(
            remembered_filter_name
        )
    else:
        current_selected_page_filter_index = 0

    selected_page_filter_name = st.radio(
        "Page filter",
        page_filter_list,
        index=current_selected_page_filter_index,
        on_change=on_change_page_filter,
        key="radio_button_filter_selection",
        args=(page_filter_name_to_config_mapping,),
    )

    display_config()
# The "working_file_pdf" session entry is set by on_pdf_file_upload, so this
# preview is only rendered once a PDF has been uploaded.
if "working_file_pdf" in st.session_state:
    # Display the uploaded pdf, located through the temporary file's path.
    working_pdf_path = st.session_state["working_file_pdf"].name
    pdf_iframe_html = get_pdf_iframe(working_pdf_path)
    st.markdown(pdf_iframe_html, unsafe_allow_html=True)