In [12]:
import pandas as pd

# Quick look at the raw browse_conditions export; the file is pipe-delimited.
df = pd.read_csv(
    'file_db/browse_conditions.txt',
    sep='|',
)

df.head()

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,336369685,NCT04016870,Infections,infections,mesh-ancestor
1,336369788,NCT03266874,Necrosis,necrosis,mesh-list
2,336369897,NCT02743455,Fever,fever,mesh-list
3,336370004,NCT01683877,Neoplasms,neoplasms,mesh-ancestor
4,336370095,NCT01268579,Carcinoma,carcinoma,mesh-list


In [13]:
# Tables used to build the dataset.
files_to_keep = [
    "brief_summaries",
    "interventions",
    "keywords",
    "browse_conditions",
]

# Further candidates (maybe): "study_references", "sponsors", "overall_officials",
# "pending_results", "outcome_analyses", "provided_documents",
# "reported_event_totals", "responsible_parties"



In [14]:
# Build one row per trial (nct_id) that combines the brief summary, the
# interventions, the keywords and the MeSH browse conditions.
#
# All merges use pandas' default inner join, so a trial is kept only if it
# appears in every one of the four tables.


def _collect_per_trial(frame, value_col, list_col):
    """Gather every `value_col` value of a trial into one list.

    Parameters
    ----------
    frame : pd.DataFrame
        Table with an 'nct_id' column and a `value_col` column.
    value_col : str
        Column whose values are collected.
    list_col : str
        Name given to the resulting list column.

    Returns
    -------
    pd.DataFrame
        Two columns, 'nct_id' and `list_col`, one row per distinct nct_id.
    """
    return (
        frame.groupby('nct_id')[value_col]
        .apply(list)
        .rename(list_col)
        .reset_index()
    )


df_summary = pd.read_csv('file_db/brief_summaries.txt', delimiter='|')
df_summary = df_summary.rename(columns={'description': 'summary'})

### create and merge intervention ###
df_intervention = pd.read_csv('file_db/interventions.txt', delimiter='|')
df_intervention = df_intervention.rename(columns={'description': 'intervention_description'})

intervention_grouped = _collect_per_trial(df_intervention, 'name', 'intervention_name')
merged_df = pd.merge(
    df_summary[['nct_id', 'summary']],
    intervention_grouped[['nct_id', 'intervention_name']],
    on='nct_id',
    validate='many_to_one',
)

# The interventions table has one row per intervention. Merging it as-is would
# multiply every trial row by its number of interventions, only for the extra
# rows to be thrown away by the final drop_duplicates. Keeping the first
# intervention row per trial up front yields the same type/description values
# without the intermediate blow-up.
# NOTE(review): intervention_type / intervention_description therefore describe
# only the first listed intervention, while intervention_name lists all of them.
first_intervention = df_intervention.drop_duplicates(subset='nct_id')[
    ['nct_id', 'intervention_type', 'intervention_description']
]
merged_df = pd.merge(
    merged_df,
    first_intervention,
    on='nct_id',
    validate='many_to_one',
)

### create and merge keywords ###
df_keyword = pd.read_csv('file_db/keywords.txt', delimiter='|')
keywords_grouped = _collect_per_trial(df_keyword, 'name', 'keywords')

merged_df = pd.merge(
    merged_df,
    keywords_grouped,
    on='nct_id',
    validate='many_to_one',
)

### create and merge browse conditions
df_condition = pd.read_csv('file_db/browse_conditions.txt', delimiter='|')
# The column name 'desease_condition' (sic) is kept unchanged because the
# export cell and the CSV header rely on this exact spelling.
conditions_grouped = _collect_per_trial(df_condition, 'downcase_mesh_term', 'desease_condition')

merged_df = pd.merge(
    merged_df,
    conditions_grouped,
    on='nct_id',
    validate='many_to_one',
)

# Still needed in case brief_summaries itself lists a trial more than once.
merged_df = merged_df.drop_duplicates(subset='nct_id').reset_index(drop=True)

merged_df.head()



Unnamed: 0,nct_id,summary,intervention_name,intervention_type,intervention_description,keywords,desease_condition
0,NCT03569293,The objective of this study is to assess the e...,"[Placebo for Upadacitinib, Upadacitinib]",Drug,Tablets taken orally once a day,"[Atopic Dermatitis, Upadacitinib]","[dermatitis, atopic, dermatitis, eczema, skin ..."
2,NCT03556839,The study will integrate the efficacy of combi...,"[Atezolizumab, Bevacizumab, Cisplatin/Carbopla...",Drug,Intravenous Infusion,"[Cervix, Carcinoma, Atezolizumab]","[carcinoma, neoplasms, glandular and epithelia..."
6,NCT03526874,Migraine affects 10-28% of children and adoles...,[Lidocaine 4% Topical Application Cream [LMX 4...,Drug,Run-in Step: All subjects receive 32 mg (4 cm ...,"[Episodic Migraine, Headache, Nerve Block, Pai...","[pain, migraine disorders, headache, headache ..."
9,NCT03526835,"This is a Phase 1/2 open-label, multi-center, ...","[MCLA-158, MCLA-158 +Pembrolizumab]",Drug,full-length IgG1 bispecific antibody targeting...,"[Bispecific antibody, First-in-human, MCLA-158...","[squamous cell carcinoma of head and neck, neo..."
11,NCT02272751,This study will aim to compare the effects of ...,"[Exercise, Relaxation]",Behavioral,The Exercise intervention will consist of aero...,"[cancer survivorship, exercise, relaxation, mi...","[lymphoma, neoplasms by histologic type, neopl..."


In [15]:
# Build the export table: 'desease_condition' is kept as its own column and
# every other column is flattened into a single "column: value" text blob,
# one line per column, per trial.
#
# The guard makes the cell safe to re-run: once merged_df has been reduced to
# (desease_condition, text), rebuilding would nest the text inside itself
# ("text: nct_id: ...") and overwrite the CSV with it.
if 'text' not in merged_df.columns:
    feature_frame = merged_df.drop(columns=['desease_condition'])
    trial_text = feature_frame.apply(
        lambda row: '\n'.join(f"{col}: {val}" for col, val in row.items()),
        axis=1,
    )
    # .assign returns a new frame rather than writing a column into merged_df
    # in place, which avoids SettingWithCopyWarning on a filtered frame.
    merged_df = merged_df.assign(text=trial_text)[['desease_condition', 'text']]

# Save the DataFrame to a new CSV file
merged_df.to_csv('clinical_trials.csv', index=False)

merged_df.head()

Unnamed: 0,desease_condition,text
0,"[dermatitis, atopic, dermatitis, eczema, skin ...",nct_id: NCT03569293\nsummary: The objective of...
2,"[carcinoma, neoplasms, glandular and epithelia...",nct_id: NCT03556839\nsummary: The study will i...
6,"[pain, migraine disorders, headache, headache ...",nct_id: NCT03526874\nsummary: Migraine affects...
9,"[squamous cell carcinoma of head and neck, neo...",nct_id: NCT03526835\nsummary: This is a Phase ...
11,"[lymphoma, neoplasms by histologic type, neopl...",nct_id: NCT02272751\nsummary: This study will ...
