Ronan committed
Commit ec6dd69
Parent(s): 5a0bc6c
feat: first commit
feat: add license
feat : add requirements.txt
fix: rm poetry dependency
fix:opencv version
fix requirements
add pillow-heif = "^0.15.0"
fix: sck learn
add packages.txt
rm camelot config
fix: dependency issues
fix altair version
add tesseract packages
rm tesseract-ocr-dev
fix: come back to aggrid 0.3.4
feat : update
comment cleaning part
rm dependency cleaning
add models
Update Hugging Face
fix: use app.py
update
UPDATE
fix: dont use app/ path
fix: path
add Extractable
fix: no ExtractTable
This view is limited to 50 files because it contains too many changes.
- .gitignore +1 -0
- .streamlit/config.toml +2 -0
- LICENSE +20 -0
- README.md +3 -6
- __pycache__/menu.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +3 -0
- configs/test_full_workflow.yaml +16 -0
- configs/v0.yaml +20 -0
- country_by_country/.empty +0 -0
- country_by_country/__init__.py +21 -0
- country_by_country/__main__.py +67 -0
- country_by_country/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/__pycache__/__main__.cpython-310.pyc +0 -0
- country_by_country/__pycache__/dash_demo.cpython-310.pyc +0 -0
- country_by_country/__pycache__/dash_process_methods.cpython-310.pyc +0 -0
- country_by_country/__pycache__/processor.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/camelot_extractor.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc +0 -0
- country_by_country/img_table_extraction/__pycache__/unstructured.cpython-310.pyc +0 -0
- country_by_country/models/decision_tree_model.joblib +0 -0
- country_by_country/models/random_forest_country_names.pkl +0 -0
- country_by_country/models/random_forest_keywords.pkl +0 -0
- country_by_country/models/random_forest_model_high_false_positive.joblib +0 -0
- country_by_country/models/random_forest_model_low_false_positive.joblib +0 -0
- country_by_country/pagefilter/__init__.py +41 -0
- country_by_country/pagefilter/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/copy_as_is.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/filter_pages.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/from_filename.cpython-310.pyc +0 -0
- country_by_country/pagefilter/__pycache__/rf_classifier.cpython-310.pyc +0 -0
- country_by_country/pagefilter/copy_as_is.py +51 -0
- country_by_country/pagefilter/from_filename.py +79 -0
- country_by_country/pagefilter/rf_classifier.py +153 -0
- country_by_country/processor.py +87 -0
- country_by_country/table_cleaning/__init__.py +34 -0
- country_by_country/table_cleaning/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/table_cleaning/__pycache__/llm_cleaner.cpython-310.pyc +0 -0
- country_by_country/table_cleaning/llm_cleaner.py +183 -0
- country_by_country/table_extraction/__init__.py +61 -0
- country_by_country/table_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/camelot_extractor.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/extract_table_api.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/from_csv.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/unstructured.cpython-310.pyc +0 -0
- country_by_country/table_extraction/__pycache__/unstructured_api.cpython-310.pyc +0 -0
- country_by_country/table_extraction/camelot_extractor.py +57 -0
- country_by_country/table_extraction/extract_table_api.py +63 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+venv*
.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
+[client]
+showSidebarNavigation = false
LICENSE
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2015-2024 Data4Good
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,9 @@
 ---
-title:
-
-
-colorTo: blue
+title: TaxObservatory Demo
+colorFrom: red
+colorTo: green
 sdk: streamlit
 sdk_version: 1.32.2
 app_file: app.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/menu.cpython-310.pyc
ADDED
Binary file (1.37 kB).

__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.03 kB).
app.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.switch_page("pages/0_Import_File.py")
configs/test_full_workflow.yaml
ADDED
@@ -0,0 +1,16 @@
+# Full workflow
+# Requires OpenAI API key and only works with table_extraction:Unstructured and r
+
+pagefilter:
+  type: FromFilename
+
+table_extraction:
+  - type: Unstructured
+    params:
+      pdf_image_dpi: 300
+      hi_res_model_name: "yolox"
+
+table_cleaning:
+  - type: LLM
+    params:
+      openai_model: "gpt-4-turbo-preview"
configs/v0.yaml
ADDED
@@ -0,0 +1,20 @@
+pagefilter:
+  type: RFClassifier
+  params:
+    modelfile: random_forest_model_low_false_positive.joblib
+
+table_extraction:
+  - type: Camelot
+    params:
+      flavor: stream
+  - type: Camelot
+    params:
+      flavor: lattice
+  - type: Unstructured
+    params:
+      hi_res_model_name: "yolox"
+      pdf_image_dpi: 300
+  # - type: LLamaParse
+  # - type: UnstructuredAPI
+
+# table_cleaning:
country_by_country/.empty
ADDED
File without changes
country_by_country/__init__.py
ADDED
@@ -0,0 +1,21 @@
+# MIT License
+#
+# Copyright (c) 2024 dataforgood
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
country_by_country/__main__.py
ADDED
@@ -0,0 +1,67 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+import logging
+import pickle
+import sys
+from pathlib import Path
+
+import yaml
+
+# Local imports
+from dotenv import load_dotenv
+
+from country_by_country import processor
+
+NUM_CLI_ARGS = 3
+
+
+def process_report(config: dict, pdf_filepath: str) -> None:
+    # Loading API keys from .env file
+    load_dotenv()
+
+    proc = processor.ReportProcessor(config)
+    return proc.process(pdf_filepath)
+
+
+if __name__ == "__main__":
+
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
+
+    if len(sys.argv) != NUM_CLI_ARGS:
+        logging.error("Usage : python -m country_by_country config.yaml report.pdf")
+        sys.exit(-1)
+
+    logging.info(f"\nLoading {sys.argv[1]}")
+    with Path(sys.argv[1]).open() as fh:
+        config = yaml.safe_load(fh)
+
+    assets = process_report(config, sys.argv[2])
+
+    # Save all the assets to disk
+    with Path("assets.pkl").open("wb") as fh:
+        pickle.dump(assets, fh)
+    logging.info(
+        "Assets dumped in assets.pkl. You can read then using : \n"
+        + "pickle.load(open('assets.pkl', 'rb'))",
+    )
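A minimal sketch (not part of the commit) of reading back the assets.pkl file written by the entry point above, after a run such as python -m country_by_country configs/v0.yaml annual_report_7-9.pdf; the report name is a placeholder following the FromFilename convention:

import pickle
from pathlib import Path

# Load the assets dictionary dumped by country_by_country/__main__.py
with Path("assets.pkl").open("rb") as fh:
    assets = pickle.load(fh)

# Pages kept by the page filter and number of extraction results
print(assets["pagefilter"]["selected_pages"])
print(len(assets["table_extractors"]))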
country_by_country/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (176 Bytes).

country_by_country/__pycache__/__main__.cpython-310.pyc
ADDED
Binary file (977 Bytes).

country_by_country/__pycache__/dash_demo.cpython-310.pyc
ADDED
Binary file (10.5 kB).

country_by_country/__pycache__/dash_process_methods.cpython-310.pyc
ADDED
Binary file (6.62 kB).

country_by_country/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (1.38 kB).

country_by_country/img_table_extraction/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (699 Bytes).

country_by_country/img_table_extraction/__pycache__/camelot_extractor.cpython-310.pyc
ADDED
Binary file (1.3 kB).

country_by_country/img_table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc
ADDED
Binary file (1.65 kB).

country_by_country/img_table_extraction/__pycache__/unstructured.cpython-310.pyc
ADDED
Binary file (1.84 kB).

country_by_country/models/decision_tree_model.joblib
ADDED
Binary file (5.1 kB).

country_by_country/models/random_forest_country_names.pkl
ADDED
Binary file (10.5 kB).

country_by_country/models/random_forest_keywords.pkl
ADDED
Binary file (328 Bytes).

country_by_country/models/random_forest_model_high_false_positive.joblib
ADDED
Binary file (21.1 kB).

country_by_country/models/random_forest_model_low_false_positive.joblib
ADDED
Binary file (106 kB).
country_by_country/pagefilter/__init__.py
ADDED
@@ -0,0 +1,41 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+
+# Local imports
+from .copy_as_is import CopyAsIs
+from .from_filename import FromFilename
+from .rf_classifier import RFClassifier
+
+
+def from_config(config: dict) -> CopyAsIs | FromFilename:
+    filter_type = config["type"]
+    if "params" in config:
+        params = config["params"]
+
+    if filter_type == "CopyAsIs":
+        return CopyAsIs()
+    elif filter_type == "FromFilename":
+        return FromFilename()
+    elif filter_type == "RFClassifier":
+        return RFClassifier(**params)
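A short sketch of how this factory is typically driven by the pagefilter block of a config such as configs/v0.yaml; the PDF path below is a placeholder:

from country_by_country import pagefilter

# Same structure as the "pagefilter" section of configs/v0.yaml
config = {
    "type": "RFClassifier",
    "params": {"modelfile": "random_forest_model_low_false_positive.joblib"},
}
page_filter = pagefilter.from_config(config)

# The filter writes its results into the shared assets dictionary
assets = {}
page_filter("some_report.pdf", assets)  # placeholder path to a CbCR report
print(assets["pagefilter"]["selected_pages"])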
country_by_country/pagefilter/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (630 Bytes).

country_by_country/pagefilter/__pycache__/copy_as_is.cpython-310.pyc
ADDED
Binary file (1.13 kB).

country_by_country/pagefilter/__pycache__/filter_pages.cpython-310.pyc
ADDED
Binary file (777 Bytes).

country_by_country/pagefilter/__pycache__/from_filename.cpython-310.pyc
ADDED
Binary file (1.83 kB).

country_by_country/pagefilter/__pycache__/rf_classifier.cpython-310.pyc
ADDED
Binary file (5.05 kB).
country_by_country/pagefilter/copy_as_is.py
ADDED
@@ -0,0 +1,51 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# External imports
+import pypdf
+
+
+class CopyAsIs:
+    """
+    Dummy filter just copying the source pdf to a target
+    temporary file
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __call__(self, pdf_filepath: str, assets: dict) -> None:
+        """
+        Basically keeps all the pages of the original document
+        Writes assets:
+            src_pdf: the original pdf filepath
+            selected_pages : list of selected pages
+        """
+
+        reader = pypdf.PdfReader(pdf_filepath)
+        n_pages = len(reader.pages)
+
+        if assets is not None:
+            assets["pagefilter"] = {
+                "src_pdf": pdf_filepath,
+                "selected_pages": list(range(n_pages)),
+            }
country_by_country/pagefilter/from_filename.py
ADDED
@@ -0,0 +1,79 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+from pathlib import Path
+
+NUM_PAGE_FIELDS = 2
+
+
+class FromFilename:
+    """
+    Filtering from filename. This filter expects the filename
+    of the pdf contains either the page or a page range of interest
+    explicitely given in the filename as :
+
+    /dir/containing/the/filename_of_the_report_#1.pdf
+    /dif/containing/the/filename_of_the_report_#1-#2.pdf
+
+    where #1 is a single page
+          #1-#2 is a page range
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __call__(self, pdf_filepath: str, assets: dict) -> None:
+        """
+        Reads and processes a pdf from its filepath
+        It writes the filtered pdf as a temporary pdf
+        The filepath of this temporary pdf is returned
+
+        Writes assets:
+            src_pdf: the original pdf filepath
+            target_pdf: the temporary target pdf filepath
+            selected_pages : list of selected pages
+        """
+
+        # Get the page or page range from the filename
+        src_filename = Path(pdf_filepath).name
+
+        # We remove the extension, split on "_" and keep the last field
+        pagefield = src_filename[:-4].split("_")[-1]
+        selected_pages = []
+
+        if pagefield.isnumeric():
+            selected_pages = [int(pagefield) - 1]
+        else:
+            pagefields = pagefield.split("-")
+            if (
+                len(pagefields) == NUM_PAGE_FIELDS
+                and pagefields[0].isnumeric()
+                and pagefields[1].isnumeric()
+            ):
+                selected_pages = list(range(int(pagefields[0]) - 1, int(pagefields[1])))
+
+        if assets is not None:
+            assets["pagefilter"] = {
+                "src_pdf": pdf_filepath,
+                "selected_pages": selected_pages,
+            }
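A small illustration of the filename convention handled above, using a hypothetical file name; FromFilename only parses the name, so no actual PDF needs to exist:

from country_by_country.pagefilter.from_filename import FromFilename

assets = {}
# "annual_report_7-9.pdf" encodes the page range 7-9 in its suffix
FromFilename()("annual_report_7-9.pdf", assets)
print(assets["pagefilter"]["selected_pages"])  # [6, 7, 8] (pages 7 to 9, 0-indexed)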
country_by_country/pagefilter/rf_classifier.py
ADDED
@@ -0,0 +1,153 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard import
+import pickle
+import pkgutil
+import tempfile
+
+# External imports
+import joblib
+import numpy as np
+import pypdf
+
+
+class FeatureExtractor:
+    """
+    A class to extract the features of a page as required by the random forest
+    classifier
+    """
+
+    def __init__(self, keywords: list[str], all_country_names: list[str]) -> None:
+        """
+        Arguments:
+            keywords: the keywords to count from the page text content
+            all_country_names: the country names/flags to count in the page content
+        """
+        self.all_country_names = all_country_names
+        self.keywords = keywords
+
+    def number_country_names(self, text: str) -> int:
+        """
+        Computes and returns the total number of occurence of any of the the
+        country names
+        """
+        return sum([text.count(country) for country in self.all_country_names])
+
+    def keyword(self, text: str, keyword: str) -> int:
+        """
+        Computes and returns the number of occurence of the specific keyword
+        """
+        return text.count(keyword)
+
+    def __call__(self, text: str) -> np.array:
+        """
+        Extracts the feature vector from the text
+        The features we extract are:
+        - nb_country: the total number of country names in the page
+        - keywords: how many times a string in the list of keywords is contained in the page
+
+        A typical list of keywords is :
+        ["tax","countr","country by country","country-by-country","report","cbc",\
+        "revenu","transparen","ethic","incom","employ","benefi","asset","contrib",\
+        "profit","accrued","jurisdiction","sales","ebt","paid","stated","accu","tangible",\
+        "fte", "expense", "related","headcount","capital","turnover","retained","current",\
+        "plant","work","intragroup","remuneration","debt","contribution","per country"]
+        """
+        features = [self.number_country_names(text)]
+        features.extend([self.keyword(text, keyword_i) for keyword_i in self.keywords])
+        return features
+
+
+class RFClassifier:
+    """
+    RandomForest classifier of whether a page contains a CbCR table or not
+    This randomforest decides from the text content of the page and is unable
+    to detect a page where a CbCR table would be included as an image
+    """
+
+    def __init__(self, modelfile: str) -> None:
+        # Access the model bundled in the package
+        data = pkgutil.get_data(
+            "country_by_country",
+            f"models/{modelfile}",
+        )
+        keywords = pickle.loads(
+            pkgutil.get_data("country_by_country", "models/random_forest_keywords.pkl"),
+        ).split(",")
+
+        all_country_names = pickle.loads(
+            pkgutil.get_data(
+                "country_by_country",
+                "models/random_forest_country_names.pkl",
+            ),
+        )
+        self.feature_extractor = FeatureExtractor(keywords, all_country_names)
+        # Unpack the data in a temporary file that joblib can then load
+        with tempfile.NamedTemporaryFile("wb", delete=False) as fp:
+            fp.write(data)
+            fp.close()
+            self.clf = joblib.load(fp.name)
+
+    def __call__(self, pdf_filepath: str, assets: dict) -> None:
+        """
+        Reads and processes a pdf from its filepath
+        It writes the filtered pdf as a temporary pdf
+        The filepath of this temporary pdf is returned
+
+        Writes assets:
+            src_pdf: the original pdf filepath
+            target_pdf: the temporary target pdf filepath
+            selected_pages : List of int
+        """
+
+        reader = pypdf.PdfReader(pdf_filepath)
+
+        # Extract the features from all the pages
+        page_features = []
+        for p in reader.pages:
+            content = p.extract_text().lower()
+            page_features.append(self.feature_extractor(content))
+
+        # features is now num_pages x num_features_per_page
+        page_features = np.array(page_features)
+        n_pages, n_features_per_page = page_features.shape
+
+        # Concatenate the features of the previous page and the next page
+        # the random forest expects
+        # [features_page_{i-1}, features_page_{i}, features_pages_{i+1}]
+        features = np.zeros((n_pages, 3 * n_features_per_page))
+        features[1:, :n_features_per_page] = page_features[:-1]
+        features[:, n_features_per_page:-n_features_per_page] = page_features
+        features[:-1, -n_features_per_page:] = page_features[1:]
+
+        # Performs the prediction
+        predictions = self.clf.predict(features)
+
+        # And now we keep only the pages that have been selected
+        selected_pages = [ip for ip, keep_p in enumerate(predictions) if keep_p]
+
+        if assets is not None:
+            assets["pagefilter"] = {
+                "src_pdf": pdf_filepath,
+                "selected_pages": selected_pages,
+            }
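A tiny NumPy sketch of the neighbour-page feature stacking performed in __call__ above, with made-up feature values; pages at the document boundaries are zero-padded:

import numpy as np

page_features = np.array([[1, 0], [2, 3], [4, 5]])  # 3 pages, 2 features per page
n_pages, n_feat = page_features.shape
features = np.zeros((n_pages, 3 * n_feat))
features[1:, :n_feat] = page_features[:-1]   # previous page
features[:, n_feat:-n_feat] = page_features  # current page
features[:-1, -n_feat:] = page_features[1:]  # next page
# features[0] -> [0., 0., 1., 0., 2., 3.]  (first page has no previous page)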
country_by_country/processor.py
ADDED
@@ -0,0 +1,87 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+import logging
+
+# Local imports
+from . import pagefilter, table_extraction
+from .utils.utils import keep_pages
+
+
+class ReportProcessor:
+    def __init__(self, config: dict) -> None:
+        # Report filter
+        self.page_filter = pagefilter.from_config(config["pagefilter"])
+
+        self.table_extractors = []
+        self.table_cleaners = []
+
+        # Tables extraction
+        if "table_extraction" in config:
+            table_extractors = config["table_extraction"]
+            self.table_extractors = [
+                table_extraction.from_config(name) for name in table_extractors
+            ]
+
+        # Table cleaning & reformatting
+        # We can do this step only if we had table extraction algorithms
+        # otherwise, the assets will not be available
+        # if "table_cleaning" in config:
+        #     table_cleaners = config["table_cleaning"]
+        #     self.table_cleaners = [
+        #         table_cleaning.from_config(name) for name in table_cleaners
+        #     ]
+
+    def process(self, pdf_filepath: str) -> dict:
+        logging.info(f"Processing {pdf_filepath}")
+
+        assets = {
+            "pagefilter": {},
+            "table_extractors": [],
+            "table_cleaners": [],
+        }
+
+        # Identifying the pages to extract
+        self.page_filter(pdf_filepath, assets)
+
+        # Now that we identified the pages to be extracted, we extract them
+        # Note, in a GUI, we could ask the user to the change the content of
+        # assets["pagefilter"]["selected_pages"] before selecting the pages
+        pdf_to_process = keep_pages(
+            pdf_filepath,
+            assets["pagefilter"]["selected_pages"],
+        )
+
+        # Process the selected pages to detect the tables and extract
+        # their contents
+        for table_extractor in self.table_extractors:
+            new_asset = table_extractor(pdf_to_process)
+            assets["table_extractors"].append(new_asset)
+
+        # Give the parsed content to the cleaner stage for getting organized data
+        # for table_cleaner in self.table_cleaners:
+        #     for asset in assets["table_extractors"]:
+        #         new_asset = table_cleaner(asset)
+        #         assets["table_cleaners"].append(new_asset)
+
+        return assets
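A minimal sketch of driving ReportProcessor programmatically, mirroring process_report in country_by_country/__main__.py; the report name is a placeholder following the FromFilename convention:

import yaml
from country_by_country.processor import ReportProcessor

# Load one of the committed pipeline configurations
with open("configs/v0.yaml") as fh:
    config = yaml.safe_load(fh)

proc = ReportProcessor(config)
assets = proc.process("annual_report_7-9.pdf")  # placeholder report path
for extracted in assets["table_extractors"]:
    print(extracted["type"], len(extracted["tables"]))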
country_by_country/table_cleaning/__init__.py
ADDED
@@ -0,0 +1,34 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Local imports
+from .llm_cleaner import LLMCleaner
+
+
+def from_config(config: dict) -> LLMCleaner:
+    extractor_type = config["type"]
+    extractor_params = {}
+    if "params" in config:
+        extractor_params = config["params"]
+    if extractor_type == "LLM":
+        return LLMCleaner(**extractor_params)
+    return None
country_by_country/table_cleaning/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (497 Bytes).

country_by_country/table_cleaning/__pycache__/llm_cleaner.cpython-310.pyc
ADDED
Binary file (5.25 kB).
country_by_country/table_cleaning/llm_cleaner.py
ADDED
@@ -0,0 +1,183 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+import logging
+import uuid
+
+import pandas as pd
+
+# External imports
+from IPython.display import display
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_openai import ChatOpenAI
+
+from country_by_country.utils import constants
+
+
+class LLMCleaner:
+    def __init__(self, **kwargs: dict) -> None:
+        """
+        Builds a table cleaner, by extracting clean data from tables
+        extracted during table extraction stage.
+        The kwargs given to the constructor are directly propagated
+        to the LLMCleaner constructor.
+        You are free to define any parameter LLMCleaner recognizes.
+        """
+        self.kwargs = kwargs
+        self.type = "llm_cleaner"
+        self.openai_model = self.kwargs["openai_model"]
+
+    def __call__(self, asset: dict) -> dict:
+        logging.info("\nKicking off cleaning stage...")
+        logging.info(f"Cleaning type: {self.type}, with params: {self.kwargs}")
+        logging.info(
+            f"Input extraction type: {asset['type']}, with params: {asset['params']}",
+        )
+
+        # Extract tables from previous stage
+        tables = asset["tables"]
+
+        logging.info(f"Pulling {len(tables)} tables from extraction stage")
+
+        # Convert tables to html to add to LLM prompt
+        html_tables = [table.to_html() for table in tables]
+
+        # Define our LLM model
+        model = ChatOpenAI(temperature=0, model=self.openai_model)
+
+        # ---------- CHAIN 1/2 - Pull countries from each table ----------
+        logging.info("Starting chain 1/2: extracting country names from tables")
+
+        # Output should have this model (a list of country names)
+        class CountryNames(BaseModel):
+            country_names: list[str] = Field(
+                description="Exhaustive list of countries with financial data in the table",
+                enum=constants.COUNTRIES,
+            )
+
+        # Output should be a JSON with above schema
+        parser1 = JsonOutputParser(pydantic_object=CountryNames)
+
+        # Prompt includes one extracted table and some JSON output formatting instructions
+        prompt1 = PromptTemplate(
+            template="Extract an exhaustive list of countries from the following table "
+            + "in html format:\n{table}\n{format_instructions}",
+            input_variables=["table"],
+            partial_variables={
+                "format_instructions": parser1.get_format_instructions(),
+            },
+        )
+
+        # Chain
+        chain1 = {"table": lambda x: x} | prompt1 | model | parser1
+
+        # Run it
+        responses1 = chain1.batch(html_tables, {"max_concurrency": 4})
+
+        # Extract country lists from responses
+        country_lists = [resp["country_names"] for resp in responses1]
+
+        # ---------- CHAIN 2/2 - Pull financial data for each country ----------
+        logging.info("Starting chain 2/2: extracting financial data from tables")
+
+        # Define country data model
+        class Country(BaseModel):
+            """Financial data about a country"""
+
+            jur_name: str = Field(..., description="Name of the country")
+            total_revenues: float | None = Field(None, description="Total revenues")
+            profit_before_tax: float | None = Field(
+                None,
+                description="Amount of profit (or loss) before tax",
+            )
+            tax_paid: float | None = Field(None, description="Income tax paid")
+            tax_accrued: float | None = Field(None, description="Accrued tax")
+            employees: float | None = Field(None, description="Number of employees")
+            stated_capital: float | None = Field(None, description="Stated capital")
+            accumulated_earnings: float | None = Field(
+                None,
+                description="Accumulated earnings",
+            )
+            tangible_assets: float | None = Field(
+                None,
+                description="Tangible assets other than cash and cash equivalent",
+            )
+
+        # Output should have this model (a list of country objects)
+        class Countries(BaseModel):
+            """Extracting financial data for each country"""
+
+            countries: list[Country]
+
+        # Output should be a JSON with above schema
+        parser2 = PydanticOutputParser(pydantic_object=Countries)
+
+        # Prompt includes one extracted table and some JSON output formatting instructions
+        template = (
+            """You are an assistant tasked with extracting financial """
+            + """data about {country_list} from the following table in html format:\n
+            {table}\n
+            {format_instructions}
+            """
+        )
+
+        # Set up prompt
+        prompt = PromptTemplate.from_template(
+            template,
+            partial_variables={
+                "format_instructions": parser2.get_format_instructions(),
+            },
+        )
+
+        # Chain
+        chain2 = (
+            {"table": lambda x: x[0], "country_list": lambda x: x[1]}
+            | prompt
+            | model.with_structured_output(Countries)
+        )
+
+        # Run it
+        responses2 = chain2.batch(
+            list(zip(html_tables, country_lists, strict=True)),
+            {"max_concurrency": 4},
+        )
+
+        # Merge the tables into one dataframe
+        df = pd.concat(
+            [pd.json_normalize(resp.dict()["countries"]) for resp in responses2],
+        ).reset_index(drop=True)
+
+        # Display
+        display(df)
+
+        # Create asset
+        new_asset = {
+            "id": uuid.uuid4(),
+            "type": self.type,
+            "params": self.kwargs,
+            "table": df,
+        }
+
+        return new_asset
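A hedged sketch of feeding the cleaner above a single extraction asset; it assumes an OPENAI_API_KEY is configured in the environment and uses a made-up one-row table as a stand-in for a real extracted CbCR table:

import pandas as pd
from country_by_country.table_cleaning.llm_cleaner import LLMCleaner

# Shape of the asset produced by an extractor such as Camelot
extraction_asset = {
    "type": "camelot",
    "params": {"flavor": "stream"},
    "tables": [pd.DataFrame({"Jurisdiction": ["France"], "Profit before tax": [12.3]})],
}

cleaner = LLMCleaner(openai_model="gpt-4-turbo-preview")
cleaned = cleaner(extraction_asset)
print(cleaned["table"])  # one row per country, columns following the Country schema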
country_by_country/table_extraction/__init__.py
ADDED
@@ -0,0 +1,61 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Local imports
+import logging
+import sys
+
+from .camelot_extractor import Camelot
+from .from_csv import FromCSV
+from .llama_parse_extractor import LlamaParseExtractor
+from .unstructured import Unstructured
+from .unstructured_api import UnstructuredAPI
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
+
+
+def from_config(config: dict) -> Camelot:
+    extractor_type = config["type"]
+    extractor_params = {}
+    if "params" in config:
+        extractor_params = config["params"]
+    if extractor_type == "Camelot":
+        return Camelot(**extractor_params)
+    elif extractor_type == "FromCSV":
+        return FromCSV(**extractor_params)
+    elif extractor_type == "Unstructured":
+        return Unstructured(**extractor_params)
+    elif extractor_type == "UnstructuredAPI":
+        return UnstructuredAPI(**extractor_params)
+    elif extractor_type == "LlamaParse":
+        return LlamaParseExtractor(**extractor_params)
+    elif extractor_type == "ExtractTableAPI":
+        # This is for legacy support
+        # In order to be able to use ExtractTable
+        # for benchmarking
+        # Note: ExtractTable-py is not maintained anymore
+        # This is the reason why this case is handled in a specific way
+        from .extract_table_api import ExtractTableAPI
+
+        return ExtractTableAPI(**extractor_params)
+    else:
+        logging.info(f"There are no extractors of the type : {extractor_type}")
country_by_country/table_extraction/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.16 kB).

country_by_country/table_extraction/__pycache__/camelot_extractor.cpython-310.pyc
ADDED
Binary file (1.3 kB).

country_by_country/table_extraction/__pycache__/extract_table_api.cpython-310.pyc
ADDED
Binary file (1.69 kB).

country_by_country/table_extraction/__pycache__/from_csv.cpython-310.pyc
ADDED
Binary file (1.24 kB).

country_by_country/table_extraction/__pycache__/llama_parse_extractor.cpython-310.pyc
ADDED
Binary file (1.89 kB).

country_by_country/table_extraction/__pycache__/unstructured.cpython-310.pyc
ADDED
Binary file (1.84 kB).

country_by_country/table_extraction/__pycache__/unstructured_api.cpython-310.pyc
ADDED
Binary file (2.24 kB).
country_by_country/table_extraction/camelot_extractor.py
ADDED
@@ -0,0 +1,57 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+import logging
+import uuid
+
+# External imports
+import camelot
+
+
+class Camelot:
+    def __init__(self, flavor: str) -> None:
+        self.flavor = flavor
+        self.type = "camelot"
+
+    def __call__(self, pdf_filepath: str) -> dict:
+        """
+        Returns asset that contain:
+        tables: a list of pandas dataframe of the parsed tables
+        """
+        logging.info("\nKicking off extraction stage...")
+        logging.info(f"Extraction type: {self.type}, with params: {self.flavor}")
+
+        tables = camelot.read_pdf(pdf_filepath, flavor=self.flavor)
+
+        # Write the parsed tables into the assets
+        tables_list = [t.df for t in tables]
+
+        # Create asset
+        new_asset = {
+            "id": uuid.uuid4(),
+            "type": "camelot",
+            "params": {"flavor": self.flavor},
+            "tables": tables_list,
+        }
+
+        return new_asset
country_by_country/table_extraction/extract_table_api.py
ADDED
@@ -0,0 +1,63 @@
+# MIT License
+# Copyright (c) 2024 dataforgood
+# (21-line MIT license header, identical to the one in country_by_country/__init__.py)
+
+# Standard imports
+import os
+import uuid
+
+# External imports
+try:
+    from ExtractTable import ExtractTable
+except ImportError as e:
+
+    class ExtractTableModuleException(Exception):
+        def __init__(self) -> None:
+            super().__init__("You must install ExtractTable : pip install ExtractTable")
+
+    raise ExtractTableModuleException() from e
+
+
+class ExtractTableAPI:
+    def __init__(self) -> None:
+        api_key = os.getenv("EXTRACT_TABLE_API_KEY")
+        self.extract_table = ExtractTable(api_key)
+
+    def __call__(self, pdf_filepath: str) -> None:
+        """
+        Writes assets:
+            ntables: the number of detected tables
+            tables: a list of pandas dataframe of the parsed tables
+        """
+        tables_list = self.extract_table.process_file(
+            filepath=pdf_filepath,
+            pages="all",
+            output_format="df",
+        )
+
+        # Create asset
+        new_asset = {
+            "id": uuid.uuid4(),
+            "type": "ExtractTableAPI",
+            "tables": tables_list,
+        }
+
+        return new_asset