|
import streamlit as st |
|
from country_by_country.processor import ReportProcessor |
|
from utils import get_pdf_iframe, set_state, generate_assets |
|
from country_by_country.utils.utils import keep_pages |
|
from pypdf import PdfReader |
|
from menu import display_pages_menu, display_config |
|
|
|
import sys |
|
import copy |
|
import logging |
|
|
|
# Send log records to stdout at INFO level, printing only the bare message
# (no timestamp / level / logger-name prefix).
logging.basicConfig(
    format="%(message)s",
    level=logging.INFO,
    stream=sys.stdout,
)
|
|
|
# Lookup table of the table extractors listed under
# st.session_state["initial_config"]["table_extraction"], keyed by each
# entry's "type" value. If two entries share a "type", the later one wins.
# Evaluated when the page script runs, so "initial_config" must already be
# present in the session state.
ALL_TABLE_EXTRACTORS = {
    cfg["type"]: cfg
    for cfg in st.session_state["initial_config"]["table_extraction"]
}
|
|
|
|
|
def set_validate() -> None:
    """Record in the session state that the page selection was validated.

    Used as the ``on_click`` callback of the "Validate your selected pages"
    button. The page multiselect is disabled as soon as this key exists in
    the session state.
    """
    flag_key = "validate_selected_pages"
    st.session_state[flag_key] = True
|
|
|
|
|
def set_extractors() -> None:
    """Apply the extractors picked in the "Extractors" multiselect.

    Used as the ``on_change`` callback of the widget whose key is
    ``"extractor_keys"``. Does nothing when that key is absent from the
    session state or holds ``None``. Otherwise the selected keys are
    resolved through ``ALL_TABLE_EXTRACTORS``, handed to ``set_state`` for
    the ``["config", "table_extraction"]`` path, a new ``ReportProcessor``
    is built from ``st.session_state["config"]`` and stored under
    ``"proc"``, and ``generate_assets()`` is called.

    Raises:
        KeyError: if a selected key is not in ``ALL_TABLE_EXTRACTORS``.
    """
    chosen_keys = st.session_state.get("extractor_keys")
    if chosen_keys is None:
        return

    chosen_extractors = []
    for extractor_key in chosen_keys:
        chosen_extractors.append(ALL_TABLE_EXTRACTORS[extractor_key])

    # NOTE(review): set_state presumably writes the value into
    # st.session_state at the given nested path -- confirm in utils.
    set_state(["config", "table_extraction"], chosen_extractors)

    # The processor is rebuilt from the config held in the session state,
    # then the assets are regenerated.
    st.session_state["proc"] = ReportProcessor(st.session_state["config"])
    generate_assets()
|
|
|
|
|
# --- Page chrome: browser tab title / layout, headings, navigation menu ---
st.set_page_config(page_title="Pages selection", layout="wide")

st.title("Country by Country Tax Reporting analysis : Selected Pages")
st.subheader("This page will allow you to select the pages containing your tables")

display_pages_menu()

# The configuration widgets are rendered in the sidebar.
with st.sidebar:
    display_config()
|
|
|
# Main body of the page: only rendered once a working PDF is in the session.
if "working_file_pdf" in st.session_state:

    # Left column (col1): PDF preview. Right column (col2): controls.
    col1, col2 = st.columns([1, 1])

    with col2:
        # Parse the PDF once and reuse the reader for the page count
        # (previously a second PdfReader was built just to count pages).
        pdfreader = PdfReader(st.session_state["working_file_pdf"])
        number_pages = len(pdfreader.pages)
        logging.info("got the assets : %s", st.session_state["assets"])

        # Page numbers are stored 0-based in the assets and shown 1-based
        # in the widget, hence the +1 here and the -1 further down.
        selected_pages = st.multiselect(
            "Which page of the following pdf contains the table you want to extract ? "
            "Defaults pages are the pages extracted by the decision tree algorithm",
            list(range(1, number_pages + 1)),
            placeholder="Select a page number",
            default=[
                i + 1
                for i in st.session_state["assets"]["pagefilter"]["selected_pages"]
            ],
            # Locked as soon as the validation flag exists in the session
            # (it is set by the set_validate callback).
            disabled="validate_selected_pages" in st.session_state,
        )

        # Pre-select the extractors that are in the current configuration.
        current_table_extractors = [
            extractor["type"]
            for extractor in st.session_state["config"]["table_extraction"]
        ]
        extractor_keys = st.multiselect(
            "Extractors",
            key="extractor_keys",
            options=ALL_TABLE_EXTRACTORS.keys(),
            default=current_table_extractors,
            on_change=set_extractors,
        )

        submitted = st.button(
            label="Validate your selected pages",
            on_click=set_validate,
        )

    # Build the preview PDF restricted to the currently selected pages.
    selected_pages = sorted(selected_pages)
    # 0-based indices, computed once and reused below.
    selected_page_indices = [i - 1 for i in selected_pages]
    logging.info("Filtering the pdf with pages : %s", selected_pages)
    st.session_state["pdf_before_page_validation"] = keep_pages(
        st.session_state["working_file_pdf"].name,
        selected_page_indices,
    )

    with col1:
        # get_pdf_iframe returns markup, so HTML rendering must be enabled.
        st.markdown(
            get_pdf_iframe(st.session_state["pdf_before_page_validation"]),
            unsafe_allow_html=True,
        )

    if submitted:
        # Persist the validated selection (0-based), build the filtered PDF
        # for the following steps and move on to the metadata page.
        st.session_state["assets"]["pagefilter"]["selected_pages"] = list(
            selected_page_indices
        )
        st.session_state["pdf_after_page_validation"] = keep_pages(
            st.session_state["working_file_pdf"].name,
            selected_page_indices,
        )
        st.switch_page("pages/2_Metadata.py")
|
|