{ "cells": [ { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import os,re,zipfile\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "file1=\"/home/itai/Downloads/Spec - Manually file prepared by Bioforum PM (1).xlsx\"\n", "file2=\"/home/itai/Downloads/ALS - Export of metadata from Rave.xlsx\"\n", "file3=\"/home/itai/Downloads/Database Validation Tracker_General Checks_v10 (1).xlsm\"" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "def fetch_sheet_names(path):\n", " sheets=[]\n", " with zipfile.ZipFile(path,'r') as zip_ref:\n", " xml=zip_ref.read(\"xl/workbook.xml\").decode(\"utf-8\")\n", " for s_tag in re.findall(\"]*\",xml) : sheets.append(re.search('name=\"[^\"]*',s_tag).group(0)[6:])\n", "\n", " return sheets" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['CRFDraft',\n", " 'Forms',\n", " 'Fields',\n", " 'Folders',\n", " 'DataDictionaries',\n", " 'DataDictionaryEntries',\n", " 'UnitDictionaries',\n", " 'UnitDictionaryEntries',\n", " 'Matrices',\n", " 'Matrix1#1YEAR',\n", " 'Matrix2#30DAY',\n", " 'Matrix3#90DAY',\n", " 'Matrix4#90DAYREPEAT',\n", " 'Matrix5#BASELINEDOV',\n", " 'Matrix6#COMPLETE',\n", " 'Matrix7#COMPLETEUS',\n", " 'Matrix8#DISCHARGE',\n", " 'Matrix9#DMREGISTRY',\n", " 'Matrix10#ENROLLED',\n", " 'Matrix11#INDEX',\n", " 'Matrix12#LBCHEMISTRY',\n", " 'Matrix13#POSTOPICU',\n", " 'Matrix14#PRIMARY',\n", " 'Matrix15#REGISTRY',\n", " 'Matrix16#REGISTRYFU',\n", " 'Matrix17#UNIQUE',\n", " 'Matrix18#UNS',\n", " 'Checks',\n", " 'CheckSteps',\n", " 'CheckActions',\n", " 'Derivations',\n", " 'DerivationSteps',\n", " 'LabVariableMappings',\n", " 'CustomFunctions',\n", " 'EmailAlerts',\n", " 'CoderConfiguration',\n", " 'CoderSupplementalTerms',\n", " 'CoderComponentTerms']" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fetch_sheet_names(file2)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "def process_excel_files(files):\n", " temp_list=[(file,fetch_sheet_names(file)) for file in files]\n", " df = [pd.read_excel(tup[0],sheet) for tup in temp_list for sheet in tup[1]]\n", " return df\n", "\n", "def fetch_sheet_names(path):\n", " sheets=[]\n", " with zipfile.ZipFile(path,'r') as zip_ref:\n", " xml=zip_ref.read(\"xl/workbook.xml\").decode(\"utf-8\")\n", " for s_tag in re.findall(\"]*\",xml) : sheets.append(re.search('name=\"[^\"]*',s_tag).group(0)[6:])\n", "\n", " return sheets" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "ename": "BadZipFile", "evalue": "File is not a zip file", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mBadZipFile\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[44], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprocess_excel_files\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfile2\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[25], line 2\u001b[0m, in \u001b[0;36mprocess_excel_files\u001b[0;34m(files)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_excel_files\u001b[39m(files):\n\u001b[0;32m----> 2\u001b[0m temp_list\u001b[38;5;241m=\u001b[39m\u001b[43m[\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43mfetch_sheet_names\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m df \u001b[38;5;241m=\u001b[39m [pd\u001b[38;5;241m.\u001b[39mread_excel(tup[\u001b[38;5;241m0\u001b[39m],sheet) \u001b[38;5;28;01mfor\u001b[39;00m tup \u001b[38;5;129;01min\u001b[39;00m temp_list \u001b[38;5;28;01mfor\u001b[39;00m sheet \u001b[38;5;129;01min\u001b[39;00m tup[\u001b[38;5;241m1\u001b[39m]]\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df\n", "Cell \u001b[0;32mIn[25], line 2\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_excel_files\u001b[39m(files):\n\u001b[0;32m----> 2\u001b[0m temp_list\u001b[38;5;241m=\u001b[39m[(file,\u001b[43mfetch_sheet_names\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files]\n\u001b[1;32m 3\u001b[0m df \u001b[38;5;241m=\u001b[39m [pd\u001b[38;5;241m.\u001b[39mread_excel(tup[\u001b[38;5;241m0\u001b[39m],sheet) \u001b[38;5;28;01mfor\u001b[39;00m tup \u001b[38;5;129;01min\u001b[39;00m temp_list \u001b[38;5;28;01mfor\u001b[39;00m sheet \u001b[38;5;129;01min\u001b[39;00m tup[\u001b[38;5;241m1\u001b[39m]]\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df\n", "Cell \u001b[0;32mIn[25], line 8\u001b[0m, in \u001b[0;36mfetch_sheet_names\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfetch_sheet_names\u001b[39m(path):\n\u001b[1;32m 7\u001b[0m sheets\u001b[38;5;241m=\u001b[39m[]\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mzipfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mZipFile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m zip_ref:\n\u001b[1;32m 9\u001b[0m xml\u001b[38;5;241m=\u001b[39mzip_ref\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxl/workbook.xml\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m s_tag \u001b[38;5;129;01min\u001b[39;00m re\u001b[38;5;241m.\u001b[39mfindall(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m]*\u001b[39m\u001b[38;5;124m\"\u001b[39m,xml) : sheets\u001b[38;5;241m.\u001b[39mappend(re\u001b[38;5;241m.\u001b[39msearch(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[^\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m]*\u001b[39m\u001b[38;5;124m'\u001b[39m,s_tag)\u001b[38;5;241m.\u001b[39mgroup(\u001b[38;5;241m0\u001b[39m)[\u001b[38;5;241m6\u001b[39m:])\n", "File \u001b[0;32m~/.pyenv/versions/3.11.7/lib/python3.11/zipfile.py:1302\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps, metadata_encoding)\u001b[0m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m-> 1302\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_RealGetContents\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1303\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m mode \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 1304\u001b[0m \u001b[38;5;66;03m# set the modified flag so central directory gets written\u001b[39;00m\n\u001b[1;32m 1305\u001b[0m \u001b[38;5;66;03m# even if no files are added to the archive\u001b[39;00m\n\u001b[1;32m 1306\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_didModify \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", "File \u001b[0;32m~/.pyenv/versions/3.11.7/lib/python3.11/zipfile.py:1369\u001b[0m, in \u001b[0;36mZipFile._RealGetContents\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1367\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1368\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m endrec:\n\u001b[0;32m-> 1369\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1370\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdebug \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1371\u001b[0m \u001b[38;5;28mprint\u001b[39m(endrec)\n", "\u001b[0;31mBadZipFile\u001b[0m: File is not a zip file" ] } ], "source": [ "process_excel_files([file2])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "ename": "BadZipFile", "evalue": "File is not a zip file", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mBadZipFile\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[45], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfetch_sheet_names\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile2\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[25], line 8\u001b[0m, in \u001b[0;36mfetch_sheet_names\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfetch_sheet_names\u001b[39m(path):\n\u001b[1;32m 7\u001b[0m sheets\u001b[38;5;241m=\u001b[39m[]\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mzipfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mZipFile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m zip_ref:\n\u001b[1;32m 9\u001b[0m xml\u001b[38;5;241m=\u001b[39mzip_ref\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxl/workbook.xml\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m s_tag \u001b[38;5;129;01min\u001b[39;00m re\u001b[38;5;241m.\u001b[39mfindall(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m]*\u001b[39m\u001b[38;5;124m\"\u001b[39m,xml) : sheets\u001b[38;5;241m.\u001b[39mappend(re\u001b[38;5;241m.\u001b[39msearch(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[^\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m]*\u001b[39m\u001b[38;5;124m'\u001b[39m,s_tag)\u001b[38;5;241m.\u001b[39mgroup(\u001b[38;5;241m0\u001b[39m)[\u001b[38;5;241m6\u001b[39m:])\n", "File \u001b[0;32m~/.pyenv/versions/3.11.7/lib/python3.11/zipfile.py:1302\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps, metadata_encoding)\u001b[0m\n\u001b[1;32m 1300\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1301\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m-> 1302\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_RealGetContents\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1303\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m mode \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 1304\u001b[0m \u001b[38;5;66;03m# set the modified flag so central directory gets written\u001b[39;00m\n\u001b[1;32m 1305\u001b[0m \u001b[38;5;66;03m# even if no files are added to the archive\u001b[39;00m\n\u001b[1;32m 1306\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_didModify \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", "File \u001b[0;32m~/.pyenv/versions/3.11.7/lib/python3.11/zipfile.py:1369\u001b[0m, in \u001b[0;36mZipFile._RealGetContents\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1367\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1368\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m endrec:\n\u001b[0;32m-> 1369\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1370\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdebug \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1371\u001b[0m \u001b[38;5;28mprint\u001b[39m(endrec)\n", "\u001b[0;31mBadZipFile\u001b[0m: File is not a zip file" ] } ], "source": [ "fetch_sheet_names(file2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "gradio", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }