'''
This module contains helper functions to load PDFs, extract their text and generate additional metadata.
It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
information visit https://github.com/bizres
'''
| from pdfminer.high_level import extract_pages | |
| from pdfminer.layout import LTTextContainer | |
| from pdfminer.high_level import extract_text | |
| import fitz | |
| import langid | |
| langid.set_languages(['en', 'de','fr','it']) | |
| import pandas as pd | |
def pdf_to_text(file):
    '''
    Extract the text of a pdf and return it as a list of paragraphs.

    Parameters:
    file: the pdf to read; handed unchanged to pdfminer's extract_text

    Returns:
    list of strings obtained by cutting the extracted text at every
    double newline (i.e. at blank lines)
    '''
    return extract_text(file).split('\n\n')
def detect_language(text):
    '''
    Classify the language of a text with langid.

    The candidate languages are the ones configured at module level via
    langid.set_languages.

    Parameters:
    text: the text to classify

    Returns:
    the result of langid.classify for the text, passed through unchanged
    (a (language, score) pair according to the langid documentation)
    '''
    classification = langid.classify(text)
    return classification
def count_pages(pdf_file):
    '''
    Count the pages of a pdf.

    Parameters:
    pdf_file: the pdf to inspect; handed unchanged to pdfminer's extract_pages

    Returns:
    the number of items produced by extract_pages, as an int
    '''
    # Count while iterating instead of collecting every page layout in a list
    # first: only the total is needed, so no page has to be kept around.
    return sum(1 for _ in extract_pages(pdf_file))
def pdf_text_to_sections(text):
    '''
    Turn extracted text into a pandas DataFrame with one row per section.

    The text is cut into pages at every double newline, and each page is cut
    into sections at every single newline. Pages are numbered from 1; the
    section index starts at 0 and keeps counting across pages.

    NOTE(review): a "page" here is simply a chunk between double newlines;
    verify that the text passed in really separates pages that way.

    Parameters:
    text: the text to split

    Returns:
    DataFrame with the columns 'page', 'section_index' and 'section_text'
    '''
    numbered_pages = enumerate(text.split('\n\n'), start=1)
    # Flatten to (page number, line) pairs so that a single enumerate can hand
    # out the document-wide running section index.
    page_lines = (
        (page_no, line)
        for page_no, chunk in numbered_pages
        for line in chunk.split('\n')
    )
    rows = [[page_no, idx, line] for idx, (page_no, line) in enumerate(page_lines)]
    return pd.DataFrame(rows, columns=['page', 'section_index', 'section_text'])