"""Fetch ClinicalTrials.gov study records, filter them, and summarise them with an LLM.

Pipeline implemented by this script:

1. (optional) download study records from the ClinicalTrials.gov v2 API,
2. reduce each record to a small set of fields and write them to a JSON file,
3. send the reduced records through a structured-output chat model,
4. save and print the structured result.
"""

import ast
import json
import os
from typing import Any, Dict, List, Tuple

import langchain
import openai
import pandas as pd
import requests
from langchain import OpenAI
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


# getting the json files
def get_clinical_record_info(
    clinical_record_id: str, timeout: float = 30.0
) -> Dict[str, Any]:
    """Download one study record from the ClinicalTrials.gov v2 API.

    Equivalent request:
        curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" \
             -H "accept: application/json"

    Args:
        clinical_record_id: Study identifier placed in the URL path
            (e.g. ``"NCT00841061"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to decode an error body.
    response.raise_for_status()
    return response.json()


def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Download several study records, preserving the order of the given ids."""
    return [
        get_clinical_record_info(clinical_record_id)
        for clinical_record_id in clinical_record_ids
    ]


def _dig(record: Any, keys: Tuple[str, ...], default: Any) -> Any:
    """Follow ``keys`` through nested containers, returning ``default`` on any miss."""
    node = record
    for key in keys:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            # Missing key, or an intermediate value that is not subscriptable
            # in the expected way: treat the field as absent.
            return default
    return node


def _filter_record(item: Any) -> Dict[str, Any]:
    """Reduce one raw study record to the fields written to the output file."""
    protocol = _dig(item, ("protocolSection",), {})
    # NOTE(review): the original code also read conditionsModule.conditions but
    # never wrote it to the output; it is left out here so the output schema
    # stays unchanged. Confirm whether "conditions" should be emitted.
    return {
        "organization_name": _dig(
            protocol, ("identificationModule", "organization", "fullName"), ""
        ),
        "project_title": _dig(protocol, ("identificationModule", "officialTitle"), ""),
        "status": _dig(protocol, ("statusModule", "overallStatus"), ""),
        "brief_description": _dig(protocol, ("descriptionModule", "briefSummary"), ""),
        "detailed_description": _dig(
            protocol, ("descriptionModule", "detailedDescription"), ""
        ),
        "keywords": _dig(protocol, ("conditionsModule", "keywords"), []),
        "interventions": _dig(
            protocol, ("armsInterventionsModule", "interventions"), []
        ),
        "primary_outcomes": _dig(protocol, ("outcomesModule", "primaryOutcomes"), []),
        "secondary_outcomes": _dig(
            protocol, ("outcomesModule", "secondaryOutcomes"), []
        ),
        "eligibility": _dig(protocol, ("eligibilityModule",), {}),
    }


def process_json(json_file, output_file="output.json"):
    """Read raw study records and write a reduced version of each to disk.

    Args:
        json_file: Path of a JSON file holding a list of study records (a
            single record stored as a JSON object is also accepted).
        output_file: Path the filtered records are written to. Defaults to
            ``"output.json"`` in the current working directory.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``json_file`` is not valid JSON.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A lone record would otherwise be iterated key by key and produce one
    # empty entry per top-level key.
    if isinstance(data, dict):
        data = [data]

    filtered_data = [_filter_record(item) for item in data]

    # Write the filtered data to a new JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, indent=4)


def llm_config():
    """Build the prompt-plus-model chain that extracts a ``Classification``.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable; credentials must never be committed to source control.

    Returns:
        A runnable chain; invoke it with ``{"input": <records>}`` to obtain a
        ``Classification`` instance.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    tagging_prompt = ChatPromptTemplate.from_template(
        """
Extract the desired information from the following list of JSON clinical trials.

Only extract the properties mentioned in the 'Classification' function.

Passage:
{input}
"""
    )

    class Classification(BaseModel):
        """Structured-output schema the model is asked to fill in."""

        description: str = Field(description= "text description grouping all the clinical trials using brief_description and detailed_description keys")
        project_title: list = Field(description="Extract the project title of all the clinical trials")
        status: list= Field(description="Extract the status of all the clinical trials")
        keywords: list= Field(description="Extract the most relevant keywords regrouping all the clinical trials")
        interventions: list= Field(description="describe the interventions for each clinical trial using title, name and description")
        primary_outcomes: list= Field(description= "get the primary outcomes of each clinical trial")
        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
        eligibility: list= Field(description= "get the eligibilityCriteria grouping all the clinical trials")
        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
        minimum_age: list = Field(description="get the minimum age from each experiment")
        maximum_age: list = Field(description="get the maximum age from each experiment")
        gender: list = Field(description="get the gender from each experiment")

        def get_dict(self):
            """Return the fields as a plain dict (``description`` is exposed as ``summary``)."""
            return {
                "summary": self.description,
                "project_title": self.project_title,
                "status": self.status,
                "keywords": self.keywords,
                "interventions": self.interventions,
                "primary_outcomes": self.primary_outcomes,
                # "secondary_outcomes": self.secondary_outcomes,
                "eligibility": self.eligibility,
                # "healthy_volunteers": self.healthy_volunteers,
                "minimum_age": self.minimum_age,
                "maximum_age": self.maximum_age,
                "gender": self.gender,
            }

    # SECURITY: the key previously hard-coded here was exposed in source and
    # should be revoked. The key is now taken from the environment only.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before running this script."
        )

    # LLM
    llm = ChatOpenAI(
        temperature=0.6,
        model="gpt-4",
        openai_api_key=api_key,
    ).with_structured_output(Classification)

    tagging_chain = tagging_prompt | llm
    return tagging_chain


def get_llm_results(results):
    """Convert the chain's ``Classification`` result into a plain dict."""
    return results.get_dict()


def save_llm_results(results_json):
    """Write the result dict to ``llm_results.json`` in the working directory."""
    with open("llm_results.json", "w", encoding="utf-8") as f:
        json.dump(results_json, f, indent=4)


# To (re)create the input file from the API:
# clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
# print(clinical_record_info)
# with open('data.json', 'w') as f:
#     json.dump(clinical_record_info, f, indent=4)

# NOTE(review): the statements below run at import time, exactly as in the
# original script; consider moving them under an `if __name__ == "__main__":`
# guard if this module is ever imported for its functions.

# change the json file here and run it to get the output
json_file = "D:/HACKUPC/hupc/klinic/data.json"
process_json(json_file)

with open("output.json", "r", encoding="utf-8") as file:
    data = json.load(file)

tagging_chain = llm_config()
res = tagging_chain.invoke({"input": data})
result_json = get_llm_results(res)
save_llm_results(result_json)
print(result_json)