jonathanlehner committed on
Commit
8c7c98a
1 Parent(s): 2ff1e50

added dialoggpt

Files changed (9)
  1. .gitignore +36 -0
  2. Pipfile +21 -0
  3. README 2.md +38 -0
  4. ai_single_response.py +278 -0
  5. app.py +196 -0
  6. config.json +34 -0
  7. file_test.py +3 -0
  8. requirements.txt +101 -0
  9. utils.py +282 -0
.gitignore ADDED
@@ -0,0 +1,36 @@
+ # python basics
+ /__pycache__/
+ /.idea/
+ /scratch/
+
+
+ # local model folders for testing / running bots / deploy
+
+ /gpt2_std_gpu_774M_120ksteps/
+ /gpt2_std_gpu_774M_60ksteps/
+ /gpt2_dailydialogue_355M_75Ksteps/
+ /gp2_DDandPeterTexts_14kPeter_774M/
+ /gp2_DDandPeterTexts_41kPeter-774M/
+ /gp2_DDandPeterTexts_774M_73Ksteps/
+ /gp2_DDandPeterTexts_gpu_774M_175Ksteps/
+ *checkpoint*
+ *GPT2*
+ *GPTneo*
+ *GPTpeter*
+ *1pt3B*
+
+ # most of ^ can be downloaded through `download_models.py`
+
+ # gradio things
+ *.db
+ *.db-journal
+ *gradio_queue*
+ gradio_data
+ deploy-as-bot/flagged
+ deploy-as-bot/gradio_data
+ deploy-as-bot/gradio_queue.db
+
+
+ # notebooks containing personal data
+ .DS_Store
+ aitextgen
Pipfile ADDED
@@ -0,0 +1,21 @@
+ [[source]]
+ url = "https://pypi.org/simple"
+ verify_ssl = true
+ name = "pypi"
+
+ [packages]
+ natsort = "==7.1.1"
+ pandas = "==1.3.0"
+ symspellpy = "==6.7.0"
+ requests = "==2.24.0"
+ transformers = "==4.8.2"
+ gradio = "==1.7.7"
+ tqdm = "==4.43.0"
+ aitextgen = "==0.5.2"
+ cleantext = "==1.1.3"
+ telegram = "==0.0.1"
+
+ [dev-packages]
+
+ [requires]
+ python_version = "3.8"
README 2.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ title: Ai Msgbot Gpt2 M XL
+ emoji: 📉
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ ---
+
+ # Configuration
+
+ `title`: _string_
+ Display title for the Space
+
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+
+ `colorFrom`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `colorTo`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `sdk`: _string_
+ Can be either `gradio` or `streamlit`
+
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+ Path is relative to the root of the repository.
+
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
+
ai_single_response.py ADDED
@@ -0,0 +1,278 @@
+ """
+ ai_single_response.py
+
+ An executable way to call the model. Example:
+ *\gpt2_chatbot> python .\ai_single_response.py --prompt "where is the grocery store?" --time
+
+ extended summary:
+
+ Passes a single prompt message to a GPT-based chatbot model and returns the generated reply. The speaker poses the prompt to a responder persona, and the responder's reply is returned; the model does not remember past exchanges between calls.
+
+ """
+ import argparse
+ import pprint as pp
+ import time
+ import warnings
+ from datetime import datetime
+ from pathlib import Path
+ from cleantext import clean
+
+ warnings.filterwarnings(action="ignore", message=".*gradient_checkpointing*")
+
+ from aitextgen import aitextgen
+
+
+ def query_gpt_model(
+     folder_path,
+     prompt_msg: str,
+     speaker=None,
+     responder="person beta",
+     kparam=150,
+     temp=0.75,
+     top_p=0.65,
+     verbose=False,
+     use_gpu=False,
+ ):
+     """
+     query_gpt_model - pass a prompt in to the model, get a response. Does NOT "remember" past conversation.
+
+     Args:
+         folder_path (str or Path): folder containing the model files (pytorch_model.bin + config.json)
+         prompt_msg (str): the message for the model to respond to
+         speaker (str, optional): who says the prompt. Defaults to None.
+         responder (str, optional): who answers the prompt. Defaults to "person beta".
+         kparam (int, optional): top-k sampling parameter. Defaults to 150.
+         temp (float, optional): sampling temperature. Defaults to 0.75.
+         top_p (float, optional): nucleus sampling fraction. Defaults to 0.65.
+         verbose (bool, optional): print diagnostic output. Defaults to False.
+         use_gpu (bool, optional): run generation on GPU. Defaults to False.
+
+     Returns:
+         [dict]: a dict with A) just the model response as str B) the total conversation
+     """
+     ai = aitextgen(
+         model="microsoft/DialoGPT-medium",
+         # model_folder=folder_path,  # local-folder loading disabled; DialoGPT is hardcoded
+         to_gpu=use_gpu,
+     )
+     print("loaded model")
+     p_list = []
+     if "natqa" in str(folder_path).lower():
+         speaker = "person alpha"  # manual correction
+         responder = "person beta"
+     if "wow" in str(folder_path).lower():
+         speaker = "person alpha"  # manual correction
+         responder = "person beta"
+     if "peter" in str(folder_path).lower():
+         speaker = None  # manual correction
+         responder = "peter szemraj"
+     if speaker is not None:
+         p_list.append(speaker.lower() + ":" + "\n")  # write prompt as the speaker
+     p_list.append(prompt_msg.lower() + "\n")
+     p_list.append("\n")
+     p_list.append(responder.lower() + ":" + "\n")
+     this_prompt = "".join(p_list)
+     if verbose:
+         print("overall prompt:\n")
+         pp.pprint(this_prompt, indent=4)
+     print("\n... generating... \n")
+     this_result = ai.generate(
+         n=1,
+         top_k=kparam,
+         batch_size=512,
+         max_length=128,
+         min_length=16,
+         prompt=this_prompt,
+         temperature=temp,
+         top_p=top_p,
+         do_sample=True,
+         return_as_list=True,
+         use_cache=True,
+     )
+     if verbose:
+         pp.pprint(this_result)  # to see what is going on
+     try:
+         this_result = str(this_result[0]).split("\n")
+         res_out = [clean(ele) for ele in this_result]
+         p_out = [clean(ele) for ele in p_list]
+         if verbose:
+             pp.pprint(res_out)  # to see what is going on
+             pp.pprint(p_out)  # to see what is going on
+
+         # keep only the lines that are the responder's reply, not the echoed prompt
+         diff_list = []
+         name_counter = 0
+         break_safe = False
+         for resline in res_out:
+             if (responder + ":") in resline:
+                 name_counter += 1
+                 break_safe = True  # next line is a response from the bot
+                 continue
+             if ":" in resline and name_counter > 0:
+                 if break_safe:
+                     diff_list.append(resline)
+                     break_safe = False
+                 else:
+                     break
+             if resline in p_out:
+                 break_safe = False
+                 continue
+             else:
+                 diff_list.append(resline)
+                 break_safe = False
+
+         if verbose:
+             print("------------------------diff list: ")
+             pp.pprint(diff_list)  # to see what is going on
+             print("---------------------------------")
+
+         output = ", ".join(diff_list)
+
+     except Exception:
+         output = "oops, there was an error. try again"
+
+     p_list.append(output + "\n")
+     p_list.append("\n")
+
+     model_responses = {"out_text": output, "full_conv": p_list}
+     print("finished!\n")
+
+     return model_responses
+
+
+ # Set up the parsing of command-line arguments
+ def get_parser():
+     """
+     get_parser - a helper function for the argparse module
+
+     Returns:
+         [argparse.ArgumentParser]: the argparser relevant for this script
+     """
+
+     parser = argparse.ArgumentParser(
+         description="submit a message and have a 774M parameter GPT model respond"
+     )
+     parser.add_argument(
+         "--prompt",
+         required=True,  # MUST HAVE A PROMPT
+         type=str,
+         help="the message the bot is supposed to respond to. Prompt is said by speaker, answered by responder.",
+     )
+     parser.add_argument(
+         "--model",
+         required=False,
+         type=str,
+         # "gp2_DDandPeterTexts_774M_73Ksteps", - from GPT-Peter
+         default="GPT2_trivNatQAdailydia_774M_175Ksteps",
+         help="folder - with respect to git directory of your repo that has the model files in it (pytorch_model.bin + "
+         "config.json). No models? Run the script download_models.py",
+     )
+
+     parser.add_argument(
+         "--speaker",
+         required=False,
+         default=None,
+         help="who the prompt is from (to the bot). Primarily relevant to bots trained on multi-individual chat data",
+     )
+     parser.add_argument(
+         "--responder",
+         required=False,
+         default="person beta",
+         help="who the responder is. Primarily relevant to bots trained on multi-individual chat data",
+     )
+
+     parser.add_argument(
+         "--topk",
+         required=False,
+         type=int,
+         default=150,
+         help="top-k sampling: the k most likely tokens considered at each step (positive integer). lower = less random responses",
+     )
+
+     parser.add_argument(
+         "--temp",
+         required=False,
+         type=float,
+         default=0.75,
+         help="specify temperature hyperparam (0-1). roughly considered as 'model creativity'",
+     )
+
+     parser.add_argument(
+         "--topp",
+         required=False,
+         type=float,
+         default=0.65,
+         help="nucleus sampling frac (0-1). aka: what fraction of possible options are considered?",
+     )
+
+     parser.add_argument(
+         "--verbose",
+         default=False,
+         action="store_true",
+         help="pass this argument if you want all the printouts",
+     )
+     parser.add_argument(
+         "--time",
+         default=False,
+         action="store_true",
+         help="pass this argument if you want to know runtime",
+     )
+     return parser
+
+
+ if __name__ == "__main__":
+     args = get_parser().parse_args()
+     query = args.prompt
+     model_dir = str(args.model)
+     model_loc = Path.cwd() / model_dir
+     spkr = args.speaker
+     rspndr = args.responder
+     k_results = args.topk
+     my_temp = args.temp
+     my_top_p = args.topp
+     want_verbose = args.verbose
+     want_rt = args.time
+
+     # force-update the speaker + responder params for the generic model case
+     if "dailydialogue" in model_dir.lower():
+         spkr = "john smith"
+         rspndr = "nancy sellers"
+         # ^ arbitrary people created when parsing the Daily Dialogue dataset
+     if "natqa" in model_dir.lower():
+         spkr = "person alpha"
+         rspndr = "person beta"
+         # ^ arbitrary people created when parsing NatQA + TriviaQA + Daily Dialogue datasets
+
+     st = time.time()
+
+     resp = query_gpt_model(
+         folder_path=model_loc,
+         prompt_msg=query,
+         speaker=spkr,
+         responder=rspndr,
+         kparam=k_results,
+         temp=my_temp,
+         top_p=my_top_p,
+         verbose=want_verbose,
+         use_gpu=False,
+     )
+
+     output = resp["out_text"]
+     pp.pprint(output, indent=4)
+
+     rt = round(time.time() - st, 1)
+
+     if want_rt:
+         print("took {runtime} seconds to generate. \n".format(runtime=rt))
+
+     if want_verbose:
+         print("finished - ", datetime.now())
+         p_list = resp["full_conv"]
+         print("A transcript of your chat is as follows: \n")
+         p_list = [item.strip() for item in p_list]
+         pp.pprint(p_list)
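
For reference, a minimal sketch of calling `query_gpt_model` directly from Python instead of via the CLI. Note that as written above, the function loads `microsoft/DialoGPT-medium` regardless, so `folder_path` only feeds the speaker/responder name heuristics:

    from ai_single_response import query_gpt_model

    resp = query_gpt_model(
        folder_path="GPT2_trivNatQAdailydia_774M_175Ksteps",  # used only for name heuristics here
        prompt_msg="where is the grocery store?",
        responder="person beta",
        kparam=150,
        temp=0.75,
        top_p=0.65,
    )
    print(resp["out_text"])   # just the model reply
    print(resp["full_conv"])  # prompt + reply as a list of lines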
app.py ADDED
@@ -0,0 +1,196 @@
+ """
+
+ deploy-as-bot\gradio_chatbot.py
+
+ A script for deploying the chatbot to Gradio. Gradio is a basic "deploy" interface which allows other users to test your model from a web URL. It also enables some basic functionality, like letting users flag weird responses.
+ Note that the URL is displayed once the script is run.
+
+ Set the working directory to */deploy-as-bot in the terminal before running.
+
+ """
+ import os
+ import sys
+ from os.path import dirname
+
+ sys.path.append(dirname(dirname(os.path.abspath(__file__))))
+
+ import gradio as gr
+ import logging
+ import argparse
+ import time
+ import warnings
+ from pathlib import Path
+ from cleantext import clean
+ from transformers import pipeline
+ from datetime import datetime
+ from ai_single_response import query_gpt_model
+ # from gradio.networking import get_state, set_state
+ from flask import Flask, request, session, jsonify, abort, send_file, render_template, redirect
+
+ import nltk
+ nltk.download("stopwords")
+
+ warnings.filterwarnings(action="ignore", message=".*gradient_checkpointing*")
+
+ logging.basicConfig()
+ cwd = Path.cwd()
+ my_cwd = str(cwd.resolve())  # string so it can be passed to os.path() objects
+
+
+ def gramformer_correct(corrector, qphrase: str):
+     """
+     gramformer_correct - correct a string using a text2text-generation pipeline model from transformers
+
+     Args:
+         corrector (transformers.pipeline): transformers pipeline object, already created w/ the relevant model
+         qphrase (str): text to be corrected
+
+     Returns:
+         [str]: corrected text
+     """
+
+     try:
+         corrected = corrector(
+             clean(qphrase), return_text=True, clean_up_tokenization_spaces=True
+         )
+         return corrected[0]["generated_text"]
+     except Exception:
+         print("NOTE - failed to correct with gramformer")
+         return clean(qphrase)
+
+
+ def ask_gpt(message: str, sender: str = ""):
+     """
+     ask_gpt - queries the relevant model with a prompt message and (optional) speaker name
+
+     Args:
+         message (str): prompt message to respond to
+         sender (str, optional): speaker, aka who said the message. Defaults to "".
+
+     Returns:
+         [str]: the model response as a string
+     """
+     st = time.time()
+     prompt = clean(message)  # clean user input
+     prompt = prompt.strip()  # get rid of any extra whitespace
+     if len(prompt) > 200:
+         prompt = prompt[-200:]  # truncate to the last 200 characters
+     sender = clean(sender.strip())
+     if len(sender) > 2:
+         try:
+             prompt_speaker = clean(sender)
+         except Exception:
+             # there was some issue getting that info, whatever
+             prompt_speaker = None
+     else:
+         prompt_speaker = None
+
+     resp = query_gpt_model(
+         folder_path=model_loc,
+         prompt_msg=prompt,
+         speaker=prompt_speaker,
+         kparam=150,
+         temp=0.75,
+         top_p=0.65,  # optimize this with hyperparam search
+     )
+     bot_resp = gramformer_correct(corrector, qphrase=resp["out_text"])
+     rt = round(time.time() - st, 2)
+     print(f"took {rt} sec to respond")
+
+     return bot_resp
+
102
+
103
+ def chat(first_and_last_name, message):
104
+ """
105
+ chat - helper function that makes the whole gradio thing work.
106
+
107
+ Args:
108
+ first_and_last_name (str or None): [speaker of the prompt, if provided]
109
+ message (str): [description]
110
+
111
+ Returns:
112
+ [str]: [returns an html string to display]
113
+ """
114
+ history = session.get("my_state") or []
115
+ response = ask_gpt(message, sender=first_and_last_name)
116
+ history.append((f"{first_and_last_name}: " + message, " GPT-Model: " + response)) #+ " [end] "))
117
+ session["my_state"] = history
118
+ session.modified = True
119
+ #html = "<div class='chatbot'>"
120
+ #for user_msg, resp_msg in history:
121
+ # html += f"<div class='user_msg'>{user_msg}</div>"
122
+ # html += f"<div class='resp_msg' style='color: black'>{resp_msg}</div>"
123
+ #html += "</div>"
124
+ return response
125
+
126
+
127
+ def get_parser():
128
+ """
129
+ get_parser - a helper function for the argparse module
130
+
131
+ Returns:
132
+ [argparse.ArgumentParser]: [the argparser relevant for this script]
133
+ """
134
+
135
+ parser = argparse.ArgumentParser(
136
+ description="submit a message and have a 774M parameter GPT model respond"
137
+ )
138
+ parser.add_argument(
139
+ "--model",
140
+ required=False,
141
+ type=str,
142
+ # "gp2_DDandPeterTexts_774M_73Ksteps", - from GPT-Peter
143
+ default="GPT2_trivNatQAdailydia_774M_175Ksteps",
144
+ help="folder - with respect to git directory of your repo that has the model files in it (pytorch.bin + "
145
+ "config.json). No models? Run the script download_models.py",
146
+ )
147
+
148
+ parser.add_argument(
149
+ "--gram-model",
150
+ required=False,
151
+ type=str,
152
+ default="prithivida/grammar_error_correcter_v1",
153
+ help="text2text generation model ID from huggingface for the model to correct grammar",
154
+ )
155
+
156
+ return parser
157
+
158
+
159
+ if __name__ == "__main__":
160
+ args = get_parser().parse_args()
161
+ default_model = str(args.model)
162
+ model_loc = cwd.parent / default_model
163
+ model_loc = str(model_loc.resolve())
164
+ gram_model = args.gram_model
165
+ print(f"using model stored here: \n {model_loc} \n")
166
+ corrector = pipeline("text2text-generation", model=gram_model, device=-1)
167
+ print("Finished loading the gramformer model - ", datetime.now())
168
+ iface = gr.Interface(
169
+ chat,
170
+ inputs=["text", "text"],
171
+ outputs="html",
172
+ title="Real-Impact English Chat Demo 英语聊天演示",
173
+ description="A basic interface with a neural network model trained on general Q&A and conversation. Treat it like a friend! 带有模型的基本界面,进行了一般问答和对话训练。 请像朋友一样与他对话! \n first and last name 姓名 \n message 信息 \n Clear 清除 \nSubmit 确认 \n Screenshot 截屏",
174
+ article="**Important Notes & About: 重要说明 & 关于我们**\n"
175
+ "1. the model can take up to 200 seconds to respond sometimes, patience is a virtue. 该模型有时可能需要长达 60 秒的响应时间,请耐心等待。\n"
176
+ "2. entering a username is completely optional. 姓名输入是可选的。\n "
177
+ "3. the model was trained on several different datasets. Anything it says should be fact-checked before being regarded as a true statement. 该模型在几个不同的数据集上训练而成,它所说的任何内容都应该经过事实核查,然后才能被视为真实陈述。\n ",
178
+ css="""
179
+ .chatbox {display:flex;flex-direction:column}
180
+ .user_msg, .resp_msg {padding:4px;margin-bottom:4px;border-radius:4px;width:80%}
181
+ .user_msg {background-color:cornflowerblue;color:white;align-self:start}
182
+ .resp_msg {background-color:lightgray;align-self:self-end}
183
+ """,
184
+ allow_screenshot=True,
185
+ allow_flagging=False,
186
+ flagging_dir="gradio_data",
187
+ flagging_options=[
188
+ "great response",
189
+ "doesn't make sense",
190
+ "bad/offensive response",
191
+ ],
192
+ enable_queue=True, # allows for dealing with multiple users simultaneously
193
+ #theme="darkhuggingface",
194
+ #server_name="0.0.0.0",
195
+ )
196
+ iface.launch(share=True)
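
For context, a minimal sketch of the grammar-correction step in isolation, using the same `transformers` pipeline the script builds at startup and the default `--gram-model` ID from above:

    from transformers import pipeline

    # same text2text-generation pipeline app.py creates (device=-1 -> CPU)
    corrector = pipeline(
        "text2text-generation",
        model="prithivida/grammar_error_correcter_v1",
        device=-1,
    )

    raw = "i goed to the store yesterday"
    fixed = corrector(raw, clean_up_tokenization_spaces=True)[0]["generated_text"]
    print(fixed)  # expected: something like "I went to the store yesterday."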
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "_name_or_path": "/content/drive/MyDrive/Programming/AI_peter/gpt2_dailydialogue_gpu_355M",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": true,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "line_by_line": false,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 1024,
+   "n_head": 16,
+   "n_inner": null,
+   "n_layer": 24,
+   "n_positions": 1024,
+   "n_vocab": 50257,
+   "resid_pdrop": 0.1,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.11.3",
+   "use_cache": false,
+   "vocab_size": 50257
+ }
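
As a sanity check, a small sketch of loading this config with `transformers`; the values confirm a GPT-2 medium-sized (~355M parameter) architecture. The relative path is an assumption about where the file sits in your checkout:

    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config.from_json_file("config.json")  # path: assumption
    print(config.n_layer, config.n_embd, config.n_head)  # 24 1024 16 -> GPT-2 medium

    # instantiating from the config alone gives randomly-initialized weights;
    # the real weights come from the pytorch_model.bin in the same folder
    model = GPT2LMHeadModel(config)
    print(f"{model.num_parameters() / 1e6:.0f}M parameters")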
file_test.py ADDED
@@ -0,0 +1,3 @@
+ import os
+
+ print(os.path.exists("/Users/jonathan/ai-msgbot/gpt2_dailydialogue_355M_150Ksteps/pytorch_model.bin"))
requirements.txt ADDED
@@ -0,0 +1,101 @@
+ absl-py==1.0.0
+ aiohttp==3.8.1
+ aiosignal==1.2.0
+ aitextgen==0.5.2
+ analytics-python==1.4.0
+ APScheduler==3.6.3
+ async-timeout==4.0.2
+ attrs==21.2.0
+ backoff==1.10.0
+ backports.zoneinfo==0.2.1
+ bcrypt==3.2.0
+ cachetools==4.2.2
+ certifi==2021.10.8
+ cffi==1.15.0
+ chardet==3.0.4
+ charset-normalizer==2.0.9
+ cleantext==1.1.3
+ click==8.0.3
+ cryptography==36.0.1
+ cycler==0.11.0
+ editdistpy==0.1.3
+ ffmpy==0.3.0
+ filelock==3.4.2
+ fire==0.4.0
+ Flask==2.0.2
+ Flask-CacheBuster==1.0.0
+ Flask-Cors==3.0.10
+ Flask-Login==0.5.0
+ fonttools==4.28.5
+ frozenlist==1.2.0
+ fsspec==2021.11.1
+ future==0.18.2
+ google-auth==2.3.3
+ google-auth-oauthlib==0.4.6
+ gradio==2.4.6
+ grpcio==1.43.0
+ huggingface-hub==0.2.1
+ idna==2.10
+ importlib-metadata==4.10.0
+ itsdangerous==2.0.1
+ Jinja2==3.0.3
+ joblib==1.1.0
+ kiwisolver==1.3.2
+ Markdown==3.3.6
+ markdown2==2.4.2
+ MarkupSafe==2.0.1
+ matplotlib==3.5.1
+ monotonic==1.6
+ multidict==5.2.0
+ natsort==7.1.1
+ nltk==3.6.6
+ numpy==1.21.5
+ oauthlib==3.1.1
+ openwa==1.3.16
+ packaging==21.3
+ pandas==1.3.5
+ paramiko==2.9.1
+ Pillow==8.4.0
+ protobuf==3.19.1
+ pyasn1==0.4.8
+ pyasn1-modules==0.2.8
+ pycparser==2.21
+ pycryptodome==3.12.0
+ pyDeprecate==0.3.1
+ pydub==0.25.1
+ PyNaCl==1.4.0
+ pyparsing==3.0.6
+ python-axolotl==0.2.3
+ python-axolotl-curve25519==0.4.1.post2
+ python-dateutil==2.8.2
+ python-telegram-bot==13.8.1
+ pytorch-lightning==1.5.7
+ pytz==2021.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ regex==2021.11.10
+ requests==2.24.0
+ requests-oauthlib==1.3.0
+ rsa==4.8
+ sacremoses==0.0.46
+ selenium==3.141.0
+ six==1.16.0
+ symspellpy==6.7.6
+ tensorboard==2.7.0
+ tensorboard-data-server==0.6.1
+ tensorboard-plugin-wit==1.8.0
+ termcolor==1.1.0
+ tokenizers==0.10.3
+ torch==1.10.1
+ torchmetrics==0.6.2
+ tornado==6.1
+ tqdm==4.43.0
+ transformers==4.12.5
+ typing_extensions==4.0.1
+ tzdata==2021.5
+ tzlocal==4.1
+ urllib3==1.25.11
+ webwhatsapi==2.0.5
+ Werkzeug==2.0.2
+ yarl==1.7.2
+ zipp==3.6.0
utils.py ADDED
@@ -0,0 +1,282 @@
+ """
+ general utility functions for loading, saving, etc.
+ """
+ import os
+ from pathlib import Path
+ import pprint as pp
+ import re
+ import shutil  # zipfile formats
+ from datetime import datetime
+ from os.path import basename
+ from os.path import getsize, join
+
+ import requests
+ from cleantext import clean
+ from natsort import natsorted
+ from symspellpy import SymSpell
+ import pandas as pd
+ from tqdm.auto import tqdm
+
+
+ def get_timestamp():
+     return datetime.now().strftime("%b-%d-%Y_t-%H")
+
+
+ def correct_phrase_load(my_string: str):
+     """
+     correct_phrase_load - basic / unoptimized implementation of SymSpell to correct a string
+
+     Args:
+         my_string (str): text to be corrected
+
+     Returns:
+         [str]: the corrected text (or the original string if no suggestion is found)
+     """
+     sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
+
+     dictionary_path = (
+         r"symspell_rsc/frequency_dictionary_en_82_765.txt"  # from repo root
+     )
+     bigram_path = (
+         r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt"  # from repo root
+     )
+     # term_index is the column of the term and count_index is the
+     # column of the term frequency
+     sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
+     sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
+
+     # max edit distance per lookup (per single word, not per whole input string)
+     suggestions = sym_spell.lookup_compound(
+         clean(my_string), max_edit_distance=2, ignore_non_words=True
+     )
+     if len(suggestions) < 1:
+         return my_string
+     else:
+         first_result = suggestions[0]
+         return first_result.term
+
+
+ def fast_scandir(dirname: str):
+     """
+     fast_scandir - an os.path-based means to return all subfolders in a given filepath
+
+     Args:
+         dirname (str): the directory to scan
+
+     Returns:
+         [list]: a list of all subfolder paths (recursive)
+     """
+
+     subfolders = [f.path for f in os.scandir(dirname) if f.is_dir()]
+     for dirname in list(subfolders):
+         subfolders.extend(fast_scandir(dirname))
+     return subfolders  # list
+
+
+ def create_folder(directory: str):
+     os.makedirs(directory, exist_ok=True)
+
+
+ def chunks(lst: list, n: int):
+     """
+     chunks - yield successive n-sized chunks from lst
+
+     Args:
+         lst (list): the list to split
+         n (int): the chunk size
+
+     Yields:
+         [list]: the next n-sized chunk of lst
+     """
+
+     for i in range(0, len(lst), n):
+         yield lst[i : i + n]
+
+
+ def chunky_pandas(my_df, num_chunks: int = 4):
+     """
+     chunky_pandas - split a dataframe into `num_chunks` roughly equal chunks, returned inside a list
+
+     Args:
+         my_df (pd.DataFrame): the dataframe to split
+         num_chunks (int, optional): the number of chunks. Defaults to 4.
+
+     Returns:
+         [list]: a list of dataframes
+     """
+     n = int(len(my_df) // num_chunks)
+     list_df = [my_df[i : i + n] for i in range(0, my_df.shape[0], n)]
+
+     return list_df
+
+
+ def load_dir_files(
+     directory: str, req_extension=".txt", return_type="list", verbose=False
+ ):
+     """
+     load_dir_files - an os.path based method of returning all files with extension `req_extension` in a given directory and subdirectories
+
+     Args:
+         directory (str): the root directory to search
+         req_extension (str, optional): the file extension to look for. Defaults to ".txt".
+         return_type (str, optional): "list" for a list of paths, anything else for a {basename: path} dict. Defaults to "list".
+         verbose (bool, optional): print the files found. Defaults to False.
+
+     Returns:
+         [list or dict]: the matching file paths
+     """
+     appr_files = []
+     # r=root, d=directories, f=files
+     for r, d, f in os.walk(directory):
+         for prefile in f:
+             if prefile.endswith(req_extension):
+                 fullpath = os.path.join(r, prefile)
+                 appr_files.append(fullpath)
+
+     appr_files = natsorted(appr_files)
+
+     if verbose:
+         print("A list of files in the {} directory are: \n".format(directory))
+         if len(appr_files) < 10:
+             pp.pprint(appr_files)
+         else:
+             pp.pprint(appr_files[:10])
+             print("\n and more. There are a total of {} files".format(len(appr_files)))
+
+     if return_type.lower() == "list":
+         return appr_files
+     else:
+         if verbose:
+             print("returning dictionary")
+
+         appr_file_dict = {}
+         for this_file in appr_files:
+             appr_file_dict[basename(this_file)] = this_file
+
+         return appr_file_dict
+
+
+ def URL_string_filter(text):
+     """
+     URL_string_filter - filter out nonstandard "text" characters
+
+     Args:
+         text (str): the text to filter
+
+     Returns:
+         [str]: the filtered text
+     """
+     custom_printable = (
+         "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._"
+     )
+
+     filtered = "".join((filter(lambda i: i in custom_printable, text)))
+
+     return filtered
+
+
+ def getFilename_fromCd(cd):
+     # parse a filename out of a content-disposition header
+     if not cd:
+         return None
+     fname = re.findall("filename=(.+)", cd)
+     if len(fname) > 0:
+         output = fname[0]
+     elif "/" in cd:
+         possible_fname = cd.rsplit("/", 1)[1]
+         output = URL_string_filter(possible_fname)
+     else:
+         output = None
+     return output
+
+
+ def get_zip_URL(
+     URLtoget: str,
+     extract_loc: str = None,
+     file_header: str = "dropboxexport_",
+     verbose: bool = False,
+ ):
+     """
+     get_zip_URL - download a zip file from a URL, extract it, and return the extraction directory
+
+     Args:
+         URLtoget (str): the URL of the zip file to download
+         extract_loc (str, optional): directory to extract to. Defaults to None (creates "dropbox_dl").
+         file_header (str, optional): prefix for the saved filename. Defaults to "dropboxexport_".
+         verbose (bool, optional): print progress info. Defaults to False.
+
+     Returns:
+         [str]: the path the files were extracted to
+     """
+     r = requests.get(URLtoget, allow_redirects=True)
+     names = getFilename_fromCd(r.headers.get("content-disposition"))
+     fixed_fnames = names.split(";")  # split the multiple results
+     this_filename = file_header + URL_string_filter(fixed_fnames[0])
+
+     # define paths and save the zip file
+     if extract_loc is None:
+         extract_loc = "dropbox_dl"
+     dl_place = join(os.getcwd(), extract_loc)
+     create_folder(dl_place)
+     save_loc = join(os.getcwd(), this_filename)
+     open(save_loc, "wb").write(r.content)
+     if verbose:
+         print("downloaded file size was {} MB".format(getsize(save_loc) / 1000000))
+
+     # unpack the archive
+     shutil.unpack_archive(save_loc, extract_dir=dl_place)
+     if verbose:
+         print("extracted zip file - ", datetime.now())
+         x = load_dir_files(dl_place, req_extension="", verbose=verbose)
+
+     # remove the original archive
+     try:
+         os.remove(save_loc)
+         del save_loc
+     except Exception:
+         print("unable to delete original zipfile - check if exists", datetime.now())
+
+     print("finished extracting zip - ", datetime.now())
+
+     return dl_place
+
+
+ def merge_dataframes(data_dir: str, ext=".xlsx", verbose=False):
+     """
+     merge_dataframes - given a filepath, loads and attempts to merge all files as dataframes
+
+     Args:
+         data_dir (str): root directory to search in
+         ext (str, optional): anticipated file extension for the dataframes. Defaults to '.xlsx'.
+         verbose (bool, optional): print a summary of the merged dataframe. Defaults to False.
+
+     Returns:
+         pd.DataFrame(): the merged dataframe
+     """
+
+     src = Path(data_dir)
+     src_str = str(src.resolve())
+     mrg_df = pd.DataFrame()
+
+     all_reports = load_dir_files(directory=src_str, req_extension=ext, verbose=verbose)
+
+     failed = []
+
+     for df_path in tqdm(all_reports, total=len(all_reports), desc="joining data..."):
+         try:
+             this_df = pd.read_excel(df_path).convert_dtypes()
+             mrg_df = pd.concat([mrg_df, this_df], axis=0)
+         except Exception:
+             short_p = os.path.basename(df_path)
+             print(
+                 f"WARNING - file with extension {ext} and name {short_p} could not be read."
+             )
+             failed.append(short_p)
+
+     if len(failed) > 0:
+         print("failed to merge {} files, investigate as needed".format(len(failed)))
+
+     if verbose:
+         pp.pprint(mrg_df.info(True))
+
+     return mrg_df
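
A brief sketch of these utilities in combination; the directory name is hypothetical:

    from utils import chunks, get_timestamp, load_dir_files

    # find all .txt files under a (hypothetical) data directory, then batch them
    txt_files = load_dir_files("conversation-data", req_extension=".txt", verbose=True)
    for batch in chunks(txt_files, n=8):
        print(get_timestamp(), "- processing batch of", len(batch), "files")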