klinic / llm_res.py
1-ARIjitS
llm for getting the output added
0b336c0
raw
history blame
8.12 kB
import json
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import langchain
import os
import openai
import ast
from langchain import OpenAI
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import JSONLoader
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from typing import List, Dict, Any
import requests
# getting the json files
def get_clinical_record_info(clinical_record_id: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch a single study from the ClinicalTrials.gov v2 API as JSON.

    Args:
        clinical_record_id: Study identifier placed at the end of the request
            URL (for example ``NCT00841061``).
        timeout: Seconds to wait for the server before aborting the request.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Equivalent curl call:
    #   curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" \
    #        -H "accept: application/json"
    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on unknown ids / server errors instead of handing an error
    # payload (or a JSON decode failure) to the caller.
    response.raise_for_status()
    return response.json()
def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Fetch every requested study, one request per id.

    Args:
        clinical_record_ids: Study identifiers to look up. Ids are fetched in
            the order given; repeated ids are fetched again.

    Returns:
        One record per input id, in the same order as ``clinical_record_ids``.
    """
    return [get_clinical_record_info(record_id) for record_id in clinical_record_ids]
# (output key, path of keys below one study record, factory for the fallback value).
# A factory is used instead of a literal so every record gets its own list/dict.
_FIELD_SPECS = (
    ("organization_name", ("protocolSection", "identificationModule", "organization", "fullName"), str),
    ("project_title", ("protocolSection", "identificationModule", "officialTitle"), str),
    ("status", ("protocolSection", "statusModule", "overallStatus"), str),
    ("brief_description", ("protocolSection", "descriptionModule", "briefSummary"), str),
    ("detailed_description", ("protocolSection", "descriptionModule", "detailedDescription"), str),
    ("conditions", ("protocolSection", "conditionsModule", "conditions"), list),
    ("keywords", ("protocolSection", "conditionsModule", "keywords"), list),
    ("interventions", ("protocolSection", "armsInterventionsModule", "interventions"), list),
    ("primary_outcomes", ("protocolSection", "outcomesModule", "primaryOutcomes"), list),
    ("secondary_outcomes", ("protocolSection", "outcomesModule", "secondaryOutcomes"), list),
    ("eligibility", ("protocolSection", "eligibilityModule"), dict),
)


def _dig(record, path, default_factory):
    """Follow *path* through nested containers; return a fresh default if any step is missing."""
    current = record
    try:
        for key in path:
            current = current[key]
    except (KeyError, IndexError, TypeError):
        # Missing key, or an intermediate value that is not subscriptable that way.
        return default_factory()
    return current


def _filter_record(item):
    """Reduce one raw study record to the flat dictionary of fields kept downstream."""
    return {
        out_key: _dig(item, path, default_factory)
        for out_key, path, default_factory in _FIELD_SPECS
    }


def process_json(json_file, output_file='output.json'):
    """Extract the relevant fields from a dump of study records and save them.

    The input file must contain a JSON array of study records. Each record is
    flattened to the keys listed in ``_FIELD_SPECS``; a field that is absent
    from a record falls back to ``""``, ``[]`` or ``{}`` depending on its kind,
    so one incomplete study never aborts the run.

    Args:
        json_file: Path of the JSON file holding the raw study records.
        output_file: Path the filtered records are written to. Defaults to
            ``output.json`` in the current working directory.

    Returns:
        None. The result is written to ``output_file`` as indented JSON.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``json_file`` does not contain valid JSON.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # "conditions" was previously read from each study but never copied into
    # the filtered record; it is now part of the output.
    filtered_data = [_filter_record(item) for item in data]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=4)
def llm_config():
    """Build the LangChain pipeline that summarises a list of clinical trials.

    A prompt with a single ``{input}`` variable is piped into a GPT-4 chat
    model wrapped with ``with_structured_output(Classification)``, so the
    model is asked to fill in the ``Classification`` schema defined below.

    The OpenAI credential is read from the ``OPENAI_API_KEY`` environment
    variable; it must never be committed to source control.

    Returns:
        The runnable ``tagging_prompt | llm`` chain. Call
        ``.invoke({"input": ...})`` on it.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    tagging_prompt = ChatPromptTemplate.from_template(
        """
Extract the desired information from the following list of JSON clinical trials.
Only extract the properties mentioned in the 'Classification' function.
Passage:
{input}
"""
    )

    class Classification(BaseModel):
        """Schema the model must fill in; field descriptions double as extraction instructions."""

        description: str = Field(description= "text description grouping all the clinical trials using brief_description and detailed_description keys")
        project_title: list = Field(description="Extract the project title of all the clinical trials")
        status: list= Field(description="Extract the status of all the clinical trials")
        keywords: list= Field(description="Extract the most relevant keywords regrouping all the clinical trials")
        interventions: list= Field(description="describe the interventions for each clinical trial using title, name and description")
        primary_outcomes: list= Field(description= "get the primary outcomes of each clinical trial")
        # Disabled: secondary_outcomes ("get the secondary outcomes of each clinical trial")
        eligibility: list= Field(description= "get the eligibilityCriteria grouping all the clinical trials")
        # Disabled: healthy_volunteers ("determine whether the clinical trial requires healthy volunteers")
        minimum_age: list = Field(description="get the minimum age from each experiment")
        maximum_age: list = Field(description="get the maximum age from each experiment")
        gender: list = Field(description="get the gender from each experiment")

        def get_dict(self):
            """Return the extracted fields as a plain dict (``description`` is exposed as ``summary``)."""
            return {
                "summary": self.description,
                "project_title": self.project_title,
                "status": self.status,
                "keywords": self.keywords,
                "interventions": self.interventions,
                "primary_outcomes": self.primary_outcomes,
                "eligibility": self.eligibility,
                "minimum_age": self.minimum_age,
                "maximum_age": self.maximum_age,
                "gender": self.gender,
            }

    # SECURITY: a literal API key used to be embedded here. Any key that was
    # ever committed must be treated as leaked and revoked; the credential is
    # now supplied through the environment only.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before calling llm_config()."
        )

    # LLM
    llm = ChatOpenAI(
        temperature=0.6,
        model="gpt-4",
        openai_api_key=api_key,
    ).with_structured_output(
        Classification
    )
    tagging_chain = tagging_prompt | llm
    return tagging_chain
def get_llm_results(results):
    """Convert the chain's structured result into a plain dictionary.

    Args:
        results: Object exposing a ``get_dict()`` method, such as the value
            produced by invoking the chain built in ``llm_config``.

    Returns:
        Whatever ``results.get_dict()`` returns.
    """
    return results.get_dict()
def save_llm_results(results_json):
    """Write the LLM output to ``llm_results.json`` in the working directory.

    Args:
        results_json: JSON-serialisable object to persist; it is written with
            a four-space indent, replacing any existing file of that name.
    """
    with open('llm_results.json', 'w') as out_file:
        serialized = json.dumps(results_json, indent=4)
        out_file.write(serialized)
# Optional first step, kept disabled: download the raw studies and store them
# as data.json. Uncomment to refresh the local dump.
# clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
# print(clinical_record_info)
# with open('data.json', 'w') as f:
#     json.dump(clinical_record_info, f, indent=4)

# Pipeline: raw dump -> output.json (filtered fields) -> LLM -> llm_results.json.
# Point json_file at the raw studies dump before running.
# NOTE(review): these statements execute on import as well as on direct
# execution; consider an `if __name__ == "__main__":` guard if this module is
# ever imported from elsewhere.
json_file = "D:/HACKUPC/hupc/klinic/data.json"
process_json(json_file)

# Reload the filtered records that process_json just wrote.
with open('output.json', 'r') as file:
    data = json.loads(file.read())

tagging_chain = llm_config()
res = tagging_chain.invoke({"input": data})
result_json = get_llm_results(res)
save_llm_results(result_json)
print(result_json)