Spaces:

klinic-hackupc
/

klinic

Sleeping

klinic / llm_res.py

1-ARIjitS

llm for getting the output added

0b336c0 6 months ago

8.12 kB

	import json
	from langchain_community.document_loaders.csv_loader import CSVLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import pandas as pd
	import langchain
	import os
	import openai
	import ast
	from langchain import OpenAI
	from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import JSONLoader
	from langchain.document_loaders import UnstructuredURLLoader
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.pydantic_v1 import BaseModel, Field
	from langchain_openai import ChatOpenAI
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.pydantic_v1 import BaseModel, Field
	from langchain_openai import ChatOpenAI
	from typing import List, Dict, Any
	import requests

	# Fetch clinical trial records as JSON from the ClinicalTrials.gov API
	def get_clinical_record_info(clinical_record_id: str, timeout: float = 30.0) -> Dict[str, Any]:
	    """Fetch a single study record from the ClinicalTrials.gov v2 API.

	    Args:
	        clinical_record_id: Study identifier, e.g. "NCT00841061".
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The decoded JSON body of the response.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	        requests.Timeout: If no response arrives within ``timeout`` seconds.
	    """
	    # Equivalent request:
	    #   curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" -H "accept: application/json"
	    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
	    response = requests.get(
	        request_url,
	        headers={"accept": "application/json"},
	        # Without a timeout, requests waits forever on an unresponsive server.
	        timeout=timeout,
	    )
	    # Fail loudly on an error status instead of handing an error payload
	    # back to the caller as if it were a study record.
	    response.raise_for_status()
	    return response.json()

	def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
	    """Look up every id in ``clinical_record_ids`` and return the records in the same order.

	    Each id triggers one call to ``get_clinical_record_info``; repeated ids are
	    fetched again rather than de-duplicated.
	    """
	    return [get_clinical_record_info(record_id) for record_id in clinical_record_ids]

	# Output key -> (path below 'protocolSection', factory for the fallback value).
	# The order here is the key order of every record written to the output file.
	_PROTOCOL_FIELDS = (
	    ("organization_name", ("identificationModule", "organization", "fullName"), str),
	    ("project_title", ("identificationModule", "officialTitle"), str),
	    ("status", ("statusModule", "overallStatus"), str),
	    ("brief_description", ("descriptionModule", "briefSummary"), str),
	    ("detailed_description", ("descriptionModule", "detailedDescription"), str),
	    ("conditions", ("conditionsModule", "conditions"), list),
	    ("keywords", ("conditionsModule", "keywords"), list),
	    ("interventions", ("armsInterventionsModule", "interventions"), list),
	    ("primary_outcomes", ("outcomesModule", "primaryOutcomes"), list),
	    ("secondary_outcomes", ("outcomesModule", "secondaryOutcomes"), list),
	    ("eligibility", ("eligibilityModule",), dict),
	)


	def _get_nested(record, path, default_factory):
	    """Follow ``path`` through nested containers; return ``default_factory()`` if a step is missing."""
	    current = record
	    for key in path:
	        try:
	            current = current[key]
	        except (KeyError, IndexError, TypeError):
	            # Missing key, or an intermediate value that is not subscriptable
	            # this way: treat the whole field as absent.
	            return default_factory()
	    return current


	def process_json(json_file, output_file='output.json'):
	    """Reduce raw study records to the fields of interest and write them as JSON.

	    Reads a JSON array of study records from ``json_file``, pulls a fixed set
	    of fields out of each record's ``protocolSection``, and writes the list of
	    reduced records to ``output_file`` with 4-space indentation.

	    A field that is missing from a record falls back to an empty value
	    (``""``, ``[]`` or ``{}`` depending on the field) instead of failing.

	    Args:
	        json_file: Path of the input JSON file.
	        output_file: Path of the JSON file to write; defaults to 'output.json'.

	    Returns:
	        None. The result is only written to ``output_file``.
	    """
	    with open(json_file, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    filtered_data = []
	    for item in data:
	        filtered_item = {}
	        for name, path, default_factory in _PROTOCOL_FIELDS:
	            filtered_item[name] = _get_nested(
	                item, ("protocolSection",) + path, default_factory
	            )
	        filtered_data.append(filtered_item)

	    with open(output_file, 'w', encoding='utf-8') as f:
	        json.dump(filtered_data, f, indent=4)

	def llm_config():
	    """Build the prompt-plus-model chain that summarises a list of clinical trials.

	    The chain formats its ``input`` variable into a tagging prompt and sends it
	    to a ``gpt-4`` chat model configured to return a ``Classification`` object.

	    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable.
	    NOTE: a key used to be hard-coded at this spot; it is part of the version
	    history and should be revoked.

	    Returns:
	        The runnable ``tagging_prompt | llm`` chain; call ``.invoke({"input": ...})``.

	    Raises:
	        RuntimeError: If ``OPENAI_API_KEY`` is not set.
	    """
	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

	    # The template text below is kept verbatim, including its whitespace.
	    tagging_prompt = ChatPromptTemplate.from_template(
	"""
	Extract the desired information from the following list of JSON clinical trials.

	Only extract the properties mentioned in the 'Classification' function.

	Passage:
	{input}

	"""
	    )

	    class Classification(BaseModel):
	        # Structured-output schema: the Field descriptions are the extraction
	        # instructions the model receives for each property.
	        description: str = Field(description= "text description grouping all the clinical trials using brief_description and detailed_description keys")
	        project_title: list = Field(description="Extract the project title of all the clinical trials")
	        status: list= Field(description="Extract the status of all the clinical trials")
	        keywords: list= Field(description="Extract the most relevant keywords regrouping all the clinical trials")
	        interventions: list= Field(description="describe the interventions for each clinical trial using title, name and description")
	        primary_outcomes: list= Field(description= "get the primary outcomes of each clinical trial")
	        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
	        eligibility: list= Field(description= "get the eligibilityCriteria grouping all the clinical trials")
	        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
	        minimum_age: list = Field(description="get the minimum age from each experiment")
	        maximum_age: list = Field(description="get the maximum age from each experiment")
	        gender: list = Field(description="get the gender from each experiment")

	        def get_dict(self):
	            """Return the extracted fields as a plain dict (``description`` is exposed as 'summary')."""
	            return {
	                "summary": self.description,
	                "project_title": self.project_title,
	                "status": self.status,
	                "keywords": self.keywords,
	                "interventions": self.interventions,
	                "primary_outcomes": self.primary_outcomes,
	                # "secondary_outcomes": self.secondary_outcomes,
	                "eligibility": self.eligibility,
	                # "healthy_volunteers": self.healthy_volunteers,
	                "minimum_age": self.minimum_age,
	                "maximum_age": self.maximum_age,
	                "gender": self.gender
	            }

	    # LLM
	    llm = ChatOpenAI(
	        temperature=0.6,
	        model="gpt-4",
	        openai_api_key=api_key,
	    ).with_structured_output(
	        Classification
	    )

	    tagging_chain = tagging_prompt | llm

	    return tagging_chain

	def get_llm_results(results):
	    """Return whatever ``results.get_dict()`` produces for the given LLM result object."""
	    return results.get_dict()

	def save_llm_results(results_json, output_file='llm_results.json'):
	    """Write ``results_json`` to a file as 4-space-indented JSON.

	    Args:
	        results_json: JSON-serialisable object to store.
	        output_file: Destination path; defaults to 'llm_results.json' in the
	            current working directory.

	    Returns:
	        None.
	    """
	    # Explicit encoding so the file does not depend on the platform default.
	    with open(output_file, 'w', encoding='utf-8') as f:
	        json.dump(results_json, f, indent=4)

	# Example (disabled): download a handful of studies and cache them as data.json.
	# clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
	# print(clinical_record_info)
	#
	# with open('data.json', 'w') as f:
	#     json.dump(clinical_record_info, f, indent=4)

	# Point json_file at the raw studies JSON, then run this module to get the output.
	json_file = "D:/HACKUPC/hupc/klinic/data.json"
	process_json(json_file)

	# Load the filtered records back from output.json.
	with open('output.json', 'r') as file:
	    data = json.load(file)

	# Build the chain, run it on the filtered records, then store and print the result.
	tagging_chain = llm_config()
	res = tagging_chain.invoke({"input": data})
	result_json = get_llm_results(res)
	save_llm_results(result_json)
	print(result_json)