Spaces:
Sleeping
Sleeping
import json | |
from langchain_community.document_loaders.csv_loader import CSVLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
import pandas as pd | |
import langchain | |
import os | |
import openai | |
import ast | |
from langchain import OpenAI | |
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import JSONLoader | |
from langchain.document_loaders import UnstructuredURLLoader | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import FAISS | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain_core.pydantic_v1 import BaseModel, Field | |
from langchain_openai import ChatOpenAI | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain_core.pydantic_v1 import BaseModel, Field | |
from langchain_openai import ChatOpenAI | |
from typing import List, Dict, Any | |
import requests | |
from dotenv import load_dotenv | |
load_dotenv() | |
# getting the json files | |
def get_clinical_record_info(
    clinical_record_id: str, *, timeout: float = 30.0
) -> Dict[str, Any]:
    """Fetch a single study record from the ClinicalTrials.gov v2 REST API.

    Equivalent request:
        curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" -H "accept: application/json"

    Args:
        clinical_record_id: Study identifier interpolated into the URL path,
            e.g. "NCT00841061".
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems, timeouts, or a
            body that is not valid JSON.
    """
    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on unknown ids / server errors rather than handing an error
    # payload to downstream code as if it were a study record.
    response.raise_for_status()
    return response.json()
def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Fetch every study named in *clinical_record_ids*, preserving order.

    Each identifier is passed to ``get_clinical_record_info`` one at a time;
    duplicates in the input are fetched (and returned) more than once.

    Args:
        clinical_record_ids: Study identifiers to look up.

    Returns:
        One result of ``get_clinical_record_info`` per input identifier, in
        the same order as the input.
    """
    return [get_clinical_record_info(record_id) for record_id in clinical_record_ids]
# Output key -> (key path below "protocolSection", factory for the fallback).
# Factories are used instead of shared literals so that every record gets its
# own empty list/dict when a field is missing.
# NOTE(review): the previous implementation also read
# conditionsModule.conditions but never emitted it; the output keys are kept
# unchanged here. Add a row below if conditions should reach the LLM.
_LLM_FIELD_SPECS = (
    ("organization_name", ("identificationModule", "organization", "fullName"), str),
    ("project_title", ("identificationModule", "officialTitle"), str),
    ("status", ("statusModule", "overallStatus"), str),
    ("brief_description", ("descriptionModule", "briefSummary"), str),
    ("detailed_description", ("descriptionModule", "detailedDescription"), str),
    ("keywords", ("conditionsModule", "keywords"), list),
    ("interventions", ("armsInterventionsModule", "interventions"), list),
    ("primary_outcomes", ("outcomesModule", "primaryOutcomes"), list),
    ("secondary_outcomes", ("outcomesModule", "secondaryOutcomes"), list),
    ("eligibility", ("eligibilityModule",), dict),
)


def _lookup_nested(record, path, default_factory):
    """Follow *path* of keys into *record*; return ``default_factory()`` if any step is missing."""
    current = record
    for key in path:
        try:
            current = current[key]
        except (KeyError, TypeError, IndexError):
            # Missing key, or an intermediate value that is not subscriptable
            # with this key (None, str, list, ...).
            return default_factory()
    return current


def process_json_data_for_llm(data):
    """Reduce raw study records to the fields that are passed on to the LLM.

    Args:
        data: Iterable of study records (dicts with a ``"protocolSection"``
            key, as read by the lookups below). ``None`` is treated as an
            empty input.

    Returns:
        A list with one dict per input record, holding the keys
        ``organization_name``, ``project_title``, ``status``,
        ``brief_description``, ``detailed_description``, ``keywords``,
        ``interventions``, ``primary_outcomes``, ``secondary_outcomes`` and
        ``eligibility``. A field that is absent from a record falls back to
        ``""``, ``[]`` or ``{}`` depending on its kind.
    """
    if data is None:
        return []
    return [
        {
            output_key: _lookup_nested(
                item, ("protocolSection",) + path, default_factory
            )
            for output_key, path, default_factory in _LLM_FIELD_SPECS
        }
        for item in data
    ]
def llm_config():
    """Assemble the prompt-to-model chain used to summarise clinical trials.

    The chain pipes a single-variable prompt (``{input}``) into a ``gpt-4``
    ``ChatOpenAI`` model wrapped with ``with_structured_output`` for the
    ``Classification`` schema declared below.

    Returns:
        The runnable produced by ``prompt | structured_model``.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is missing from the environment.
    """

    # Deliberately no class docstring: pydantic would copy it into the schema
    # description that is handed to the model.
    class Classification(BaseModel):
        description: str = Field(
            description="text description grouping all the clinical trials using brief_description and detailed_description keys"
        )
        project_title: list = Field(
            description="Extract the project title of all the clinical trials"
        )
        status: list = Field(
            description="Extract the status of all the clinical trials"
        )
        keywords: list = Field(
            description="Extract the most relevant keywords regrouping all the clinical trials"
        )
        interventions: list = Field(
            description="describe the interventions for each clinical trial using title, name and description"
        )
        primary_outcomes: list = Field(
            description="get the primary outcomes of each clinical trial"
        )
        # Disabled field:
        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
        eligibility: list = Field(
            description="get the eligibilityCriteria grouping all the clinical trials"
        )
        # Disabled field:
        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
        minimum_age: list = Field(
            description="get the minimum age from each experiment"
        )
        maximum_age: list = Field(
            description="get the maximum age from each experiment"
        )
        gender: list = Field(description="get the gender from each experiment")

        def get_dict(self):
            """Return the extracted values as a plain dict (``description`` is exposed as ``"summary"``)."""
            as_dict = {"summary": self.description}
            # Remaining entries reuse the field name as the key, in schema
            # order; secondary_outcomes and healthy_volunteers stay disabled.
            for field_name in (
                "project_title",
                "status",
                "keywords",
                "interventions",
                "primary_outcomes",
                "eligibility",
                "minimum_age",
                "maximum_age",
                "gender",
            ):
                as_dict[field_name] = getattr(self, field_name)
            return as_dict

    # Prompt text is kept flush-left so no indentation leaks into the template.
    prompt_text = """
Extract the desired information from the following list of JSON clinical trials.
Only extract the properties mentioned in the 'Classification' function.
Passage:
{input}
"""
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # LLM constrained to answer in the Classification schema.
    chat_model = ChatOpenAI(
        temperature=0.6,
        model="gpt-4",
        openai_api_key=os.environ["OPENAI_API_KEY"],
    )
    structured_model = chat_model.with_structured_output(Classification)

    return prompt | structured_model
# Disabled example, kept for reference: fetch a few studies by id and dump the
# raw API responses to data.json.
# clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
# print(clinical_record_info)
# with open('data.json', 'w') as f:
#     json.dump(clinical_record_info, f, indent=4)

# Module-level chain, built once at import time. llm_config() reads
# OPENAI_API_KEY from os.environ, so the key must be available (for example
# through the load_dotenv() call at the top of this file) before import.
tagging_chain = llm_config()
def process_dictionaty_with_llm_to_generate_response(json_contents):
    """Filter raw study records and run them through the module-level tagging chain.

    Args:
        json_contents: Raw study records, forwarded unchanged to
            ``process_json_data_for_llm``.

    Returns:
        Whatever ``tagging_chain.invoke`` returns for the payload
        ``{"input": <result of process_json_data_for_llm>}``.
    """
    chain_input = {"input": process_json_data_for_llm(json_contents)}
    return tagging_chain.invoke(chain_input)