Spaces:

klinic-hackupc
/

klinic

Sleeping

File size: 15,719 Bytes

93e1b64

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>nct_id</th>\n",
       "      <th>mesh_term</th>\n",
       "      <th>downcase_mesh_term</th>\n",
       "      <th>mesh_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>336369685</td>\n",
       "      <td>NCT04016870</td>\n",
       "      <td>Infections</td>\n",
       "      <td>infections</td>\n",
       "      <td>mesh-ancestor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>336369788</td>\n",
       "      <td>NCT03266874</td>\n",
       "      <td>Necrosis</td>\n",
       "      <td>necrosis</td>\n",
       "      <td>mesh-list</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>336369897</td>\n",
       "      <td>NCT02743455</td>\n",
       "      <td>Fever</td>\n",
       "      <td>fever</td>\n",
       "      <td>mesh-list</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>336370004</td>\n",
       "      <td>NCT01683877</td>\n",
       "      <td>Neoplasms</td>\n",
       "      <td>neoplasms</td>\n",
       "      <td>mesh-ancestor</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>336370095</td>\n",
       "      <td>NCT01268579</td>\n",
       "      <td>Carcinoma</td>\n",
       "      <td>carcinoma</td>\n",
       "      <td>mesh-list</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          id       nct_id   mesh_term downcase_mesh_term      mesh_type\n",
       "0  336369685  NCT04016870  Infections         infections  mesh-ancestor\n",
       "1  336369788  NCT03266874    Necrosis           necrosis      mesh-list\n",
       "2  336369897  NCT02743455       Fever              fever      mesh-list\n",
       "3  336370004  NCT01683877   Neoplasms          neoplasms  mesh-ancestor\n",
       "4  336370095  NCT01268579   Carcinoma          carcinoma      mesh-list"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Quick look at the browse_conditions table; the file is pipe-delimited.\n",
    "conditions_path = 'file_db/browse_conditions.txt'\n",
    "df = pd.read_csv(conditions_path, sep='|')\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the tables to keep.\n",
    "files_to_keep = [\n",
    "    \"brief_summaries\",\n",
    "    \"interventions\",\n",
    "    \"keywords\",\n",
    "    \"browse_conditions\",\n",
    "]\n",
    "\n",
    "# Candidates that might be added later: \"study_references\", \"sponsors\", \"overall_officials\",\n",
    "# \"pending_results\", \"outcome_analyses\", \"provided_documents\", \"reported_event_totals\",\n",
    "# \"responsible_parties\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nct_id</th>\n",
       "      <th>summary</th>\n",
       "      <th>intervention_name</th>\n",
       "      <th>intervention_type</th>\n",
       "      <th>intervention_description</th>\n",
       "      <th>keywords</th>\n",
       "      <th>desease_condition</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NCT03569293</td>\n",
       "      <td>The objective of this study is to assess the e...</td>\n",
       "      <td>[Placebo for Upadacitinib, Upadacitinib]</td>\n",
       "      <td>Drug</td>\n",
       "      <td>Tablets taken orally once a day</td>\n",
       "      <td>[Atopic Dermatitis, Upadacitinib]</td>\n",
       "      <td>[dermatitis, atopic, dermatitis, eczema, skin ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NCT03556839</td>\n",
       "      <td>The study will integrate the efficacy of combi...</td>\n",
       "      <td>[Atezolizumab, Bevacizumab, Cisplatin/Carbopla...</td>\n",
       "      <td>Drug</td>\n",
       "      <td>Intravenous Infusion</td>\n",
       "      <td>[Cervix, Carcinoma, Atezolizumab]</td>\n",
       "      <td>[carcinoma, neoplasms, glandular and epithelia...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NCT03526874</td>\n",
       "      <td>Migraine affects 10-28% of children and adoles...</td>\n",
       "      <td>[Lidocaine 4% Topical Application Cream [LMX 4...</td>\n",
       "      <td>Drug</td>\n",
       "      <td>Run-in Step: All subjects receive 32 mg (4 cm ...</td>\n",
       "      <td>[Episodic Migraine, Headache, Nerve Block, Pai...</td>\n",
       "      <td>[pain, migraine disorders, headache, headache ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>NCT03526835</td>\n",
       "      <td>This is a Phase 1/2 open-label, multi-center, ...</td>\n",
       "      <td>[MCLA-158, MCLA-158 +Pembrolizumab]</td>\n",
       "      <td>Drug</td>\n",
       "      <td>full-length IgG1 bispecific antibody targeting...</td>\n",
       "      <td>[Bispecific antibody, First-in-human, MCLA-158...</td>\n",
       "      <td>[squamous cell carcinoma of head and neck, neo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>NCT02272751</td>\n",
       "      <td>This study will aim to compare the effects of ...</td>\n",
       "      <td>[Exercise, Relaxation]</td>\n",
       "      <td>Behavioral</td>\n",
       "      <td>The Exercise intervention will consist of aero...</td>\n",
       "      <td>[cancer survivorship, exercise, relaxation, mi...</td>\n",
       "      <td>[lymphoma, neoplasms by histologic type, neopl...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         nct_id                                            summary  \\\n",
       "0   NCT03569293  The objective of this study is to assess the e...   \n",
       "2   NCT03556839  The study will integrate the efficacy of combi...   \n",
       "6   NCT03526874  Migraine affects 10-28% of children and adoles...   \n",
       "9   NCT03526835  This is a Phase 1/2 open-label, multi-center, ...   \n",
       "11  NCT02272751  This study will aim to compare the effects of ...   \n",
       "\n",
       "                                    intervention_name intervention_type  \\\n",
       "0            [Placebo for Upadacitinib, Upadacitinib]              Drug   \n",
       "2   [Atezolizumab, Bevacizumab, Cisplatin/Carbopla...              Drug   \n",
       "6   [Lidocaine 4% Topical Application Cream [LMX 4...              Drug   \n",
       "9                 [MCLA-158, MCLA-158 +Pembrolizumab]              Drug   \n",
       "11                             [Exercise, Relaxation]        Behavioral   \n",
       "\n",
       "                             intervention_description  \\\n",
       "0                     Tablets taken orally once a day   \n",
       "2                                Intravenous Infusion   \n",
       "6   Run-in Step: All subjects receive 32 mg (4 cm ...   \n",
       "9   full-length IgG1 bispecific antibody targeting...   \n",
       "11  The Exercise intervention will consist of aero...   \n",
       "\n",
       "                                             keywords  \\\n",
       "0                   [Atopic Dermatitis, Upadacitinib]   \n",
       "2                   [Cervix, Carcinoma, Atezolizumab]   \n",
       "6   [Episodic Migraine, Headache, Nerve Block, Pai...   \n",
       "9   [Bispecific antibody, First-in-human, MCLA-158...   \n",
       "11  [cancer survivorship, exercise, relaxation, mi...   \n",
       "\n",
       "                                    desease_condition  \n",
       "0   [dermatitis, atopic, dermatitis, eczema, skin ...  \n",
       "2   [carcinoma, neoplasms, glandular and epithelia...  \n",
       "6   [pain, migraine disorders, headache, headache ...  \n",
       "9   [squamous cell carcinoma of head and neck, neo...  \n",
       "11  [lymphoma, neoplasms by histologic type, neopl...  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def _group_as_list(frame, value_col, list_col):\n",
    "    \"\"\"Return one row per `nct_id` with the `value_col` values gathered into a list.\n",
    "\n",
    "    The list column is named `list_col` in the returned frame.\n",
    "    \"\"\"\n",
    "    return (\n",
    "        frame.groupby('nct_id')[value_col]\n",
    "        .apply(list)\n",
    "        .reset_index()\n",
    "        .rename(columns={value_col: list_col})\n",
    "    )\n",
    "\n",
    "\n",
    "### load the pipe-delimited source tables ###\n",
    "df_summary = pd.read_csv('file_db/brief_summaries.txt', delimiter='|')\n",
    "df_summary = df_summary.rename(columns={'description': 'summary'})\n",
    "\n",
    "df_intervention = pd.read_csv('file_db/interventions.txt', delimiter='|')\n",
    "df_keyword = pd.read_csv('file_db/keywords.txt', delimiter='|')\n",
    "df_condition = pd.read_csv('file_db/browse_conditions.txt', delimiter='|')\n",
    "\n",
    "### collapse the one-to-many tables to one row per trial ###\n",
    "intervention_grouped = _group_as_list(df_intervention, 'name', 'intervention_name')\n",
    "keywords_grouped = _group_as_list(df_keyword, 'name', 'keywords')\n",
    "# The 'desease_condition' spelling is kept on purpose: it is the column name\n",
    "# that ends up in the exported CSV.\n",
    "conditions_grouped = _group_as_list(df_condition, 'downcase_mesh_term', 'desease_condition')\n",
    "\n",
    "df_intervention = df_intervention.rename(columns={'description': 'intervention_description'})\n",
    "\n",
    "# interventions.txt can hold several rows per trial. Joining it un-reduced repeats\n",
    "# every trial once per intervention, and only the first of those rows survives the\n",
    "# final de-duplication. Reduce to that first row up front so the merge stays at one\n",
    "# row per trial.\n",
    "# NOTE: intervention_type / intervention_description therefore describe only the\n",
    "# first listed intervention, while intervention_name lists all of them.\n",
    "intervention_first = df_intervention.drop_duplicates(subset='nct_id')[\n",
    "    ['nct_id', 'intervention_type', 'intervention_description']\n",
    "]\n",
    "\n",
    "### inner-join everything on nct_id ###\n",
    "merged_df = (\n",
    "    df_summary[['nct_id', 'summary']]\n",
    "    .merge(intervention_grouped[['nct_id', 'intervention_name']], on='nct_id')\n",
    "    .merge(intervention_first, on='nct_id')\n",
    "    .merge(keywords_grouped, on='nct_id')\n",
    "    .merge(conditions_grouped, on='nct_id')\n",
    "    # Guard in case brief_summaries itself repeats a trial.\n",
    "    .drop_duplicates(subset='nct_id')\n",
    ")\n",
    "\n",
    "assert merged_df['nct_id'].is_unique, 'expected exactly one row per trial'\n",
    "\n",
    "merged_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>desease_condition</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[dermatitis, atopic, dermatitis, eczema, skin ...</td>\n",
       "      <td>nct_id: NCT03569293\\nsummary: The objective of...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[carcinoma, neoplasms, glandular and epithelia...</td>\n",
       "      <td>nct_id: NCT03556839\\nsummary: The study will i...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>[pain, migraine disorders, headache, headache ...</td>\n",
       "      <td>nct_id: NCT03526874\\nsummary: Migraine affects...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>[squamous cell carcinoma of head and neck, neo...</td>\n",
       "      <td>nct_id: NCT03526835\\nsummary: This is a Phase ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>[lymphoma, neoplasms by histologic type, neopl...</td>\n",
       "      <td>nct_id: NCT02272751\\nsummary: This study will ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    desease_condition  \\\n",
       "0   [dermatitis, atopic, dermatitis, eczema, skin ...   \n",
       "2   [carcinoma, neoplasms, glandular and epithelia...   \n",
       "6   [pain, migraine disorders, headache, headache ...   \n",
       "9   [squamous cell carcinoma of head and neck, neo...   \n",
       "11  [lymphoma, neoplasms by histologic type, neopl...   \n",
       "\n",
       "                                                 text  \n",
       "0   nct_id: NCT03569293\\nsummary: The objective of...  \n",
       "2   nct_id: NCT03556839\\nsummary: The study will i...  \n",
       "6   nct_id: NCT03526874\\nsummary: Migraine affects...  \n",
       "9   nct_id: NCT03526835\\nsummary: This is a Phase ...  \n",
       "11  nct_id: NCT02272751\\nsummary: This study will ...  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Concatenate all columns into one written text\n",
    "merged_df['text'] = merged_df.drop(columns=['desease_condition']).apply(lambda row: '\\n'.join([f\"{col}: {val}\" for col, val in row.items()]), axis=1)\n",
    "\n",
    "# Save the DataFrame to a new CSV file\n",
    "merged_df = merged_df[['desease_condition', 'text']]\n",
    "merged_df.to_csv('clinical_trials.csv', index=False)\n",
    "\n",
    "merged_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}