Spaces:

crocidoc
/

testitest

Sleeping

testitest / text_transformation_tools.py

initial commit

cc83a1d about 3 years ago

1.44 kB

	'''
	This module contains helper functions to load PDFs, extract their text, and generate additional metadata.

	It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
	information visit https://github.com/bizres

	'''
	from pdfminer.high_level import extract_pages
	from pdfminer.layout import LTTextContainer
	from pdfminer.high_level import extract_text

	import fitz

	import langid
	# Restrict langid to English, German, French and Italian.
	langid.set_languages(['en', 'de','fr','it'])

	import pandas as pd

	def pdf_to_text(file):
	    '''
	    Extract the text of a pdf and return it as a list of paragraphs.

	    Parameters:
	    file: the pdf to read; handed unchanged to pdfminer's extract_text

	    Returns:
	    list of strings, obtained by splitting the extracted text at every
	    double newline
	    '''
	    return extract_text(file).split('\n\n')


	def detect_language(text):
	    '''
	    Classify the language of a text with langid.

	    Parameters:
	    text: the text to classify

	    Returns:
	    the result of langid.classify for the text, unchanged
	    '''
	    classification = langid.classify(text)
	    return classification

	def count_pages(pdf_file):
	    '''
	    Count the pages of a pdf.

	    Parameters:
	    pdf_file: the pdf to inspect; handed unchanged to pdfminer's extract_pages

	    Returns:
	    the number of items yielded by extract_pages
	    '''
	    # Count while iterating so the page layouts are never all held in a list.
	    return sum(1 for _ in extract_pages(pdf_file))

	def pdf_text_to_sections(text):
	    '''
	    Build a pandas DataFrame with one row per line of the extracted text.

	    The text is first cut at every double newline; each resulting chunk is
	    numbered from 1 and reported in the 'page' column. Every chunk is then
	    cut at single newlines, and each piece becomes one row. 'section_index'
	    counts rows from 0 across the whole text.

	    NOTE(review): pdf_to_text treats the same double-newline delimiter as a
	    paragraph boundary, so 'page' may not be a physical pdf page - confirm
	    against the callers.

	    Parameters:
	    text: the extracted text as a single string

	    Returns:
	    DataFrame with the columns 'page', 'section_index' and 'section_text'
	    '''
	    rows = []
	    for page_nr, chunk in enumerate(text.split('\n\n'), start=1):
	        for piece in chunk.split('\n'):
	            # The number of rows collected so far is the running section index.
	            rows.append([page_nr, len(rows), piece])

	    return pd.DataFrame(rows, columns=['page', 'section_index', 'section_text'])