|
'''
This module contains helper functions to load PDFs, extract their text and generate additional metadata.

It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
information visit https://github.com/bizres
'''
|
from pdfminer.high_level import extract_pages |
|
from pdfminer.layout import LTTextContainer |
|
from pdfminer.high_level import extract_text |
|
|
|
import fitz |
|
|
|
import langid |
|
langid.set_languages(['en', 'de','fr','it']) |
|
|
|
import pandas as pd |
|
|
|
def pdf_to_text(file):
    '''
    Extract the text of a pdf and return it as a list of paragraphs.

    Parameters:
    file: the pdf to read; it is handed unchanged to pdfminer's extract_text

    Returns:
    list of strings, obtained by splitting the extracted text at every
    double newline
    '''
    return extract_text(file).split('\n\n')
|
|
|
|
|
def detect_language(text):
    '''
    Classify the language of a text with langid.

    The candidate languages are the ones configured at module level via
    langid.set_languages ('en', 'de', 'fr' and 'it').

    Parameters:
    text: the text to classify

    Returns:
    the result of langid.classify(text), passed through unchanged
    '''
    classification = langid.classify(text)
    return classification
|
|
|
def count_pages(pdf_file):
    '''
    Count the pages of a pdf.

    Parameters:
    pdf_file: the pdf to inspect; it is handed unchanged to pdfminer's
    extract_pages

    Returns:
    the number of pages yielded by extract_pages
    '''
    # Consume the page iterator one item at a time so that only a single
    # page layout is alive at once, instead of collecting all of them in a
    # list just to take its length.
    return sum(1 for _ in extract_pages(pdf_file))
|
|
|
def pdf_text_to_sections(text, page_sep='\n\n', section_sep='\n'):
    '''
    Generate a pandas DataFrame with one row per section of the extracted text.

    The text is first cut into pages at every page_sep, then each page is cut
    into sections at every section_sep. Empty sections are kept.

    Parameters:
    text: the extracted text as a single string
    page_sep: separator between pages (default: double newline, the
    historical behaviour of this function)
    section_sep: separator between sections within a page (default: single
    newline)

    Returns:
    DataFrame with the columns
    page: 1-based number of the page the section is on
    section_index: 0-based running index of the section over the whole text
    section_text: the text of the section

    Raises:
    ValueError if page_sep or section_sep is an empty string (str.split)
    '''
    rows = []
    # The section index keeps counting across pages, so it cannot come from
    # an enumerate over the inner loop.
    section_index = 0
    for page_nr, page in enumerate(text.split(page_sep), start=1):
        for section in page.split(section_sep):
            rows.append([page_nr, section_index, section])
            section_index += 1

    return pd.DataFrame(rows, columns=['page', 'section_index', 'section_text'])
|
|