File size: 4,609 Bytes
ec6dd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import logging
import sys
import tempfile

import streamlit as st
import yaml
import copy
from menu import display_pages_menu, display_config
from pypdf import PdfReader
from utils import get_pdf_iframe, set_state

from country_by_country.processor import ReportProcessor

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")


def set_page_filter(value: dict):
    """Store ``value`` as the active page filter.

    The value is written through ``set_state`` at the key path
    ``["config", "pagefilter"]``.
    """
    page_filter_path = ["config", "pagefilter"]
    set_state(page_filter_path, value)


# Page chrome: wide layout, browser tab title, headline and sub-headline,
# followed by the navigation menu provided by the local `menu` module.
st.set_page_config(page_title="Accueil - upload de PDF", layout="wide")
st.title("Country by Country Tax Reporting analysis")
st.subheader(
    "This app will help you extract a table containing financial information from a pdf",
)
display_pages_menu()

# Sidebar: PDF upload, optional custom configuration upload, page-filter
# selection and configuration display.
with st.sidebar:

    st.markdown("# PDF Upload")

    st.markdown("## PDF Report to process")
    original_pdf = st.file_uploader(
        "Upload a pdf document containing financial table : ",
    )

    if original_pdf is not None:
        # Create the temporary copy only when there is something to store:
        # with delete=False, an unconditional NamedTemporaryFile would leave
        # one stray empty file on disk for every script run.
        mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        # getvalue() returns the full upload regardless of the current
        # read position of the uploaded buffer.
        mytmpfile.write(original_pdf.getvalue())
        # The file is later re-opened through its path (mytmpfile.name), so
        # buffered bytes must reach the disk before anyone reads it.
        mytmpfile.flush()
        st.session_state["working_file_pdf"] = mytmpfile
        st.session_state["original_pdf_name"] = original_pdf.name

    if "original_pdf_name" in st.session_state:
        st.markdown(
            "Already loaded file : " + st.session_state["original_pdf_name"],
        )

    st.markdown("# Configuration:\n")
    # Upload personalized config if required
    loaded_config = st.file_uploader(
        "Upload a config if the default config doesn't suit you :",
    )
    # Stays None unless the uploaded config passes every check below.
    loaded_config_dict = None
    if loaded_config is not None:
        if not loaded_config.name.endswith(".yaml"):
            # Rejected by extension: do not attempt to parse it.
            st.error("Please upload a yaml file")
        else:
            try:
                parsed_config = yaml.safe_load(loaded_config)
            except yaml.YAMLError:
                st.error("Unable to load yaml file config")
            else:
                # A usable config is a mapping with non-empty "pagefilter"
                # and "table_extraction" entries. An empty file, a list or
                # a scalar is reported as invalid instead of crashing.
                if (
                    isinstance(parsed_config, dict)
                    and parsed_config.get("pagefilter", False)
                    and parsed_config.get("table_extraction", False)
                ):
                    loaded_config_dict = parsed_config
                else:
                    st.error("Please upload a valid config file")
        if loaded_config_dict is None:
            # Discard the rejected upload.
            loaded_config = None

    # Extract config
    with open("extract_config.yaml", "r") as f:
        default_config = f.read()

    # First run of the session: start from the default configuration.
    if not st.session_state.get("config_is_set", False):
        st.session_state["initial_config"] = yaml.safe_load(default_config)
        st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
        st.session_state["config_is_set"] = True

    # A valid uploaded configuration replaces the current one.
    if loaded_config_dict is not None:
        st.session_state["initial_config"] = loaded_config_dict
        st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
        st.session_state["config_is_set"] = True

    # Set page filter: one radio option per entry of the "pagefilter" list,
    # labelled by the entry's "type" value.
    page_filter_radio_dict = {
        pagefilter["type"]: pagefilter
        for pagefilter in st.session_state["initial_config"]["pagefilter"]
    }
    selected_page_filter = st.radio("Page filter", list(page_filter_radio_dict))
    set_page_filter(page_filter_radio_dict[selected_page_filter])

    display_config()


if "working_file_pdf" in st.session_state:
    # Once a pdf has been uploaded, its temporary copy is stored under
    # the "working_file_pdf" key of the session state.
    # Hence, the following code will only be executed if a pdf has been uploaded.
    working_pdf = st.session_state["working_file_pdf"]

    # Display the uploaded pdf
    st.markdown(
        get_pdf_iframe(working_pdf.name),
        unsafe_allow_html=True,
    )

    # The "first_time" key is set on the first pass, so the processing
    # below runs only once per session.
    if "first_time" not in st.session_state:
        st.session_state["first_time"] = False
        logging.info("Loading config and pdf")
        st.session_state["proc"] = ReportProcessor(st.session_state["config"])

        logging.info("Config and pdf loaded")

        # Container handed to the processor, which fills it in place.
        assets = {
            "pagefilter": {},
            "table_extractors": [],
        }

        # Filtering the pages
        st.session_state["proc"].page_filter(
            working_pdf.name,
            assets,
        )

        logging.info(f"Assets : {assets}")

        if not assets["pagefilter"].get("selected_pages"):
            # No page has been automatically selected by the page filter
            # (empty, missing or None selection).
            # Hence, we select every page of the pdf, letting the user
            # choose the pages afterwards.
            number_pages = len(PdfReader(working_pdf).pages)
            assets["pagefilter"]["selected_pages"] = list(range(number_pages))
        st.session_state["assets"] = assets
        st.switch_page("pages/1_Selected_Pages.py")