seanpedrickcase committed on
Commit
96ac47b
·
1 Parent(s): bbf844d

Made example display conditional on example file existence. Turned example display off by default. Moved the (mostly) unnecessary multi-os-test workflow to archive_workflows.

Browse files
.github/workflows/{multi-os-test.yml → archive_workflows/multi-os-test.yml} RENAMED
File without changes
app.py CHANGED
@@ -1008,70 +1008,88 @@ with app:
1008
  ###
1009
  with gr.Tab("Redact PDFs/images"):
1010
 
 
1011
  # Examples for PDF/image redaction
1012
  if SHOW_EXAMPLES == "True":
1013
  gr.Markdown(
1014
  "### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
1015
  )
1016
- redaction_examples = gr.Examples(
1017
- examples=[
1018
- [
1019
- [
1020
- "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
1021
- ],
1022
- "Local model - selectable text",
1023
- "Local",
1024
- [],
1025
- CHOSEN_REDACT_ENTITIES,
1026
- CHOSEN_COMPREHEND_ENTITIES,
1027
- [
1028
- "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
1029
- ],
1030
- ],
1031
- [
1032
- ["example_data/example_complaint_letter.jpg"],
1033
- "Local OCR model - PDFs without selectable text",
1034
- "Local",
1035
- [],
1036
- CHOSEN_REDACT_ENTITIES,
1037
- CHOSEN_COMPREHEND_ENTITIES,
1038
- ["example_data/example_complaint_letter.jpg"],
1039
- ],
1040
- [
1041
- ["example_data/graduate-job-example-cover-letter.pdf"],
1042
- "Local OCR model - PDFs without selectable text",
1043
- "Local",
1044
- [],
1045
- ["TITLES", "PERSON", "DATE_TIME"],
1046
- CHOSEN_COMPREHEND_ENTITIES,
1047
- ["example_data/graduate-job-example-cover-letter.pdf"],
1048
- ],
1049
- [
1050
- ["example_data/Partnership-Agreement-Toolkit_0_0.pdf"],
1051
- "AWS Textract service - all PDF types",
1052
- "AWS Comprehend",
1053
- ["Extract handwriting", "Extract signatures"],
1054
- CHOSEN_REDACT_ENTITIES,
1055
- CHOSEN_COMPREHEND_ENTITIES,
1056
- ["example_data/Partnership-Agreement-Toolkit_0_0.pdf"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1057
  ],
1058
- ],
1059
- inputs=[
1060
- in_doc_files,
1061
- text_extract_method_radio,
1062
- pii_identification_method_drop,
1063
- handwrite_signature_checkbox,
1064
- in_redact_entities,
1065
- in_redact_comprehend_entities,
1066
- prepared_pdf_state,
1067
- ],
1068
- example_labels=[
1069
- "PDF with selectable text redaction",
1070
- "Image redaction with local OCR",
1071
- "PDF redaction with custom entities (TITLES, PERSON, DATE_TIME)",
1072
- "PDF redaction with AWS services and signature detection",
1073
- ],
1074
- )
1075
 
1076
  with gr.Accordion("Redact document", open=True):
1077
  # in_doc_files = gr.File(
@@ -1651,36 +1669,37 @@ with app:
1651
  gr.Markdown(
1652
  "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
1653
  )
1654
- duplicate_examples = gr.Examples(
1655
- examples=[
1656
- [
 
 
 
 
1657
  [
1658
- "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
 
 
 
1659
  ],
1660
- 0.95,
1661
- 10,
1662
- True,
1663
- ],
1664
- [
1665
  [
1666
- "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
 
 
 
1667
  ],
1668
- 0.95,
1669
- 3,
1670
- False,
1671
  ],
1672
- ],
1673
- inputs=[
1674
- in_duplicate_pages,
1675
- duplicate_threshold_input,
1676
- min_word_count_input,
1677
- combine_page_text_for_duplicates_bool,
1678
- ],
1679
- example_labels=[
1680
- "Find duplicate pages of text in document OCR outputs",
1681
- "Find duplicate text lines in document OCR outputs",
1682
- ],
1683
- )
1684
 
1685
  with gr.Accordion("Step 1: Configure and run analysis", open=True):
1686
  # in_duplicate_pages = gr.File(
@@ -1810,47 +1829,65 @@ with app:
1810
  gr.Markdown(
1811
  "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
1812
  )
1813
- tabular_examples = gr.Examples(
1814
- examples=[
1815
- [
1816
- ["example_data/combined_case_notes.csv"],
1817
- ["Case Note", "Client"],
1818
- "Local",
1819
- "replace with 'REDACTED'",
1820
- ["example_data/combined_case_notes.csv"],
1821
- ["Case Note"],
1822
- ],
1823
- [
1824
- ["example_data/Bold minimalist professional cover letter.docx"],
1825
- [],
1826
- "Local",
1827
- "replace with 'REDACTED'",
1828
- [],
1829
- [],
1830
- ],
1831
- [
1832
- ["example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"],
1833
- ["text"],
1834
- "Local",
1835
- "replace with 'REDACTED'",
1836
- ["example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"],
1837
- ["text"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1838
  ],
1839
- ],
1840
- inputs=[
1841
- in_data_files,
1842
- in_colnames,
1843
- pii_identification_method_drop_tabular,
1844
- anon_strategy,
1845
- in_tabular_duplicate_files,
1846
- tabular_text_columns,
1847
- ],
1848
- example_labels=[
1849
- "CSV file redaction with specific columns - remove text",
1850
- "Word document redaction - replace with REDACTED",
1851
- "Tabular duplicate detection in CSV files",
1852
- ],
1853
- )
1854
 
1855
  with gr.Accordion("Redact Word or Excel/csv files", open=True):
1856
  with gr.Accordion("Upload docx, xlsx, or csv files", open=True):
@@ -6385,3 +6422,21 @@ if __name__ == "__main__":
6385
 
6386
  # Run the CLI main function with direct mode arguments
6387
  main(direct_mode_args=direct_mode_args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  ###
1009
  with gr.Tab("Redact PDFs/images"):
1010
 
1011
+ # Examples for PDF/image redaction
1012
  # Examples for PDF/image redaction
1013
  if SHOW_EXAMPLES == "True":
1014
  gr.Markdown(
1015
  "### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
1016
  )
1017
+
1018
+ # Check which example files exist and create examples only for available files
1019
+ example_files = [
1020
+ "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
1021
+ "example_data/example_complaint_letter.jpg",
1022
+ "example_data/graduate-job-example-cover-letter.pdf",
1023
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf"
1024
+ ]
1025
+
1026
+ available_examples = []
1027
+ example_labels = []
1028
+
1029
+ # Check each example file and add to examples if it exists
1030
+ if os.path.exists(example_files[0]):
1031
+ available_examples.append([
1032
+ [example_files[0]],
1033
+ "Local model - selectable text",
1034
+ "Local",
1035
+ [],
1036
+ CHOSEN_REDACT_ENTITIES,
1037
+ CHOSEN_COMPREHEND_ENTITIES,
1038
+ [example_files[0]],
1039
+ ])
1040
+ example_labels.append("PDF with selectable text redaction")
1041
+
1042
+ if os.path.exists(example_files[1]):
1043
+ available_examples.append([
1044
+ [example_files[1]],
1045
+ "Local OCR model - PDFs without selectable text",
1046
+ "Local",
1047
+ [],
1048
+ CHOSEN_REDACT_ENTITIES,
1049
+ CHOSEN_COMPREHEND_ENTITIES,
1050
+ [example_files[1]],
1051
+ ])
1052
+ example_labels.append("Image redaction with local OCR")
1053
+
1054
+ if os.path.exists(example_files[2]):
1055
+ available_examples.append([
1056
+ [example_files[2]],
1057
+ "Local OCR model - PDFs without selectable text",
1058
+ "Local",
1059
+ [],
1060
+ ["TITLES", "PERSON", "DATE_TIME"],
1061
+ CHOSEN_COMPREHEND_ENTITIES,
1062
+ [example_files[2]],
1063
+ ])
1064
+ example_labels.append("PDF redaction with custom entities (TITLES, PERSON, DATE_TIME)")
1065
+
1066
+ if os.path.exists(example_files[3]):
1067
+ available_examples.append([
1068
+ [example_files[3]],
1069
+ "AWS Textract service - all PDF types",
1070
+ "AWS Comprehend",
1071
+ ["Extract handwriting", "Extract signatures"],
1072
+ CHOSEN_REDACT_ENTITIES,
1073
+ CHOSEN_COMPREHEND_ENTITIES,
1074
+ [example_files[3]],
1075
+ ])
1076
+ example_labels.append("PDF redaction with AWS services and signature detection")
1077
+
1078
+ # Only create examples if we have available files
1079
+ if available_examples:
1080
+ redaction_examples = gr.Examples(
1081
+ examples=available_examples,
1082
+ inputs=[
1083
+ in_doc_files,
1084
+ text_extract_method_radio,
1085
+ pii_identification_method_drop,
1086
+ handwrite_signature_checkbox,
1087
+ in_redact_entities,
1088
+ in_redact_comprehend_entities,
1089
+ prepared_pdf_state,
1090
  ],
1091
+ example_labels=example_labels,
1092
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
 
1094
  with gr.Accordion("Redact document", open=True):
1095
  # in_doc_files = gr.File(
 
1669
  gr.Markdown(
1670
  "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
1671
  )
1672
+
1673
+ # Check if duplicate example file exists
1674
+ duplicate_example_file = "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
1675
+
1676
+ if os.path.exists(duplicate_example_file):
1677
+ duplicate_examples = gr.Examples(
1678
+ examples=[
1679
  [
1680
+ [duplicate_example_file],
1681
+ 0.95,
1682
+ 10,
1683
+ True,
1684
  ],
 
 
 
 
 
1685
  [
1686
+ [duplicate_example_file],
1687
+ 0.95,
1688
+ 3,
1689
+ False,
1690
  ],
 
 
 
1691
  ],
1692
+ inputs=[
1693
+ in_duplicate_pages,
1694
+ duplicate_threshold_input,
1695
+ min_word_count_input,
1696
+ combine_page_text_for_duplicates_bool,
1697
+ ],
1698
+ example_labels=[
1699
+ "Find duplicate pages of text in document OCR outputs",
1700
+ "Find duplicate text lines in document OCR outputs",
1701
+ ],
1702
+ )
 
1703
 
1704
  with gr.Accordion("Step 1: Configure and run analysis", open=True):
1705
  # in_duplicate_pages = gr.File(
 
1829
  gr.Markdown(
1830
  "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
1831
  )
1832
+
1833
+ # Check which tabular example files exist
1834
+ tabular_example_files = [
1835
+ "example_data/combined_case_notes.csv",
1836
+ "example_data/Bold minimalist professional cover letter.docx",
1837
+ "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
1838
+ ]
1839
+
1840
+ available_tabular_examples = []
1841
+ tabular_example_labels = []
1842
+
1843
+ # Check each tabular example file and add to examples if it exists
1844
+ if os.path.exists(tabular_example_files[0]):
1845
+ available_tabular_examples.append([
1846
+ [tabular_example_files[0]],
1847
+ ["Case Note", "Client"],
1848
+ "Local",
1849
+ "replace with 'REDACTED'",
1850
+ [tabular_example_files[0]],
1851
+ ["Case Note"],
1852
+ ])
1853
+ tabular_example_labels.append("CSV file redaction with specific columns - remove text")
1854
+
1855
+ if os.path.exists(tabular_example_files[1]):
1856
+ available_tabular_examples.append([
1857
+ [tabular_example_files[1]],
1858
+ [],
1859
+ "Local",
1860
+ "replace with 'REDACTED'",
1861
+ [],
1862
+ [],
1863
+ ])
1864
+ tabular_example_labels.append("Word document redaction - replace with REDACTED")
1865
+
1866
+ if os.path.exists(tabular_example_files[2]):
1867
+ available_tabular_examples.append([
1868
+ [tabular_example_files[2]],
1869
+ ["text"],
1870
+ "Local",
1871
+ "replace with 'REDACTED'",
1872
+ [tabular_example_files[2]],
1873
+ ["text"],
1874
+ ])
1875
+ tabular_example_labels.append("Tabular duplicate detection in CSV files")
1876
+
1877
+ # Only create examples if we have available files
1878
+ if available_tabular_examples:
1879
+ tabular_examples = gr.Examples(
1880
+ examples=available_tabular_examples,
1881
+ inputs=[
1882
+ in_data_files,
1883
+ in_colnames,
1884
+ pii_identification_method_drop_tabular,
1885
+ anon_strategy,
1886
+ in_tabular_duplicate_files,
1887
+ tabular_text_columns,
1888
  ],
1889
+ example_labels=tabular_example_labels,
1890
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
1891
 
1892
  with gr.Accordion("Redact Word or Excel/csv files", open=True):
1893
  with gr.Accordion("Upload docx, xlsx, or csv files", open=True):
 
6422
 
6423
  # Run the CLI main function with direct mode arguments
6424
  main(direct_mode_args=direct_mode_args)
6425
+
6426
+
6427
+ # Combine extraction options
6428
+ extraction_options = (
6429
+ list(direct_mode_args["handwrite_signature_extraction"])
6430
+ if direct_mode_args["handwrite_signature_extraction"]
6431
+ else []
6432
+ )
6433
+ if direct_mode_args["extract_forms"]:
6434
+ extraction_options.append("Extract forms")
6435
+ if direct_mode_args["extract_tables"]:
6436
+ extraction_options.append("Extract tables")
6437
+ if direct_mode_args["extract_layout"]:
6438
+ extraction_options.append("Extract layout")
6439
+ direct_mode_args["handwrite_signature_extraction"] = extraction_options
6440
+
6441
+ # Run the CLI main function with direct mode arguments
6442
+ main(direct_mode_args=direct_mode_args)
tools/config.py CHANGED
@@ -543,7 +543,7 @@ except Exception as e:
543
  # Get some environment variables and Launch the Gradio app
544
  COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
545
 
546
- SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "True")
547
 
548
  RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
549
 
 
543
  # Get some environment variables and Launch the Gradio app
544
  COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
545
 
546
+ SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
547
 
548
  RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
549