1-ARIjitS committed on
Commit
0b336c0
1 Parent(s): 27d40b9

llm for getting the output added

Browse files
Files changed (1) hide show
  1. llm_res.py +197 -0
llm_res.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from langchain_community.document_loaders.csv_loader import CSVLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ import pandas as pd
5
+ import langchain
6
+ import os
7
+ import openai
8
+ import ast
9
+ from langchain import OpenAI
10
+ from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain_community.document_loaders import JSONLoader
13
+ from langchain.document_loaders import UnstructuredURLLoader
14
+ from langchain.embeddings import OpenAIEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain_core.prompts import ChatPromptTemplate
17
+ from langchain_core.pydantic_v1 import BaseModel, Field
18
+ from langchain_openai import ChatOpenAI
19
+ from langchain_core.prompts import ChatPromptTemplate
20
+ from langchain_core.pydantic_v1 import BaseModel, Field
21
+ from langchain_openai import ChatOpenAI
22
+ from typing import List, Dict, Any
23
+ import requests
24
+
25
+ # getting the json files
26
def get_clinical_record_info(clinical_record_id: str, timeout: float = 30.0) -> Dict[str, Any]:
    """Fetch a single study record from the ClinicalTrials.gov v2 API.

    Equivalent request:
        curl -X GET "https://clinicaltrials.gov/api/v2/studies/NCT00841061" -H "accept: application/json"

    Args:
        clinical_record_id: Study identifier, e.g. ``"NCT00841061"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    request_url = f"https://clinicaltrials.gov/api/v2/studies/{clinical_record_id}"
    response = requests.get(
        request_url,
        headers={"accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()
33
+
34
def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str, Any]]:
    """Fetch the record for every given study id, preserving input order.

    Each id is looked up with ``get_clinical_record_info``; ids are not
    de-duplicated, so a repeated id yields a repeated record.
    """
    return [
        get_clinical_record_info(record_id)
        for record_id in clinical_record_ids
    ]
40
+
41
# Output key -> (path below 'protocolSection', factory producing the fallback value).
# Factories (str/list/dict) give every record its own fresh default object.
_FIELD_SPECS = (
    ("organization_name", ("identificationModule", "organization", "fullName"), str),
    ("project_title", ("identificationModule", "officialTitle"), str),
    ("status", ("statusModule", "overallStatus"), str),
    ("brief_description", ("descriptionModule", "briefSummary"), str),
    ("detailed_description", ("descriptionModule", "detailedDescription"), str),
    ("conditions", ("conditionsModule", "conditions"), list),
    ("keywords", ("conditionsModule", "keywords"), list),
    ("interventions", ("armsInterventionsModule", "interventions"), list),
    ("primary_outcomes", ("outcomesModule", "primaryOutcomes"), list),
    ("secondary_outcomes", ("outcomesModule", "secondaryOutcomes"), list),
    ("eligibility", ("eligibilityModule",), dict),
)


def _dig(record, path, default):
    """Follow ``path`` through nested dicts; return ``default`` if any step is missing."""
    current = record
    for key in path:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def process_json(json_file, output_file='output.json'):
    """Reduce raw clinical-trial records to the fields used downstream.

    Reads ``json_file``, keeps a fixed set of fields from each record's
    ``protocolSection`` and writes the result as a JSON list to ``output_file``.
    Fields that are absent fall back to ``""``, ``[]`` or ``{}`` depending on
    the field.

    Args:
        json_file: Path to a JSON file holding a list of study records
            (a single record given as a top-level object is also accepted).
        output_file: Destination path; defaults to ``'output.json'`` in the
            current working directory.

    Returns:
        The list of filtered records that was written to ``output_file``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``json_file`` does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A lone record would otherwise be iterated key by key and produce empty rows.
    if isinstance(data, dict):
        data = [data]

    filtered_data = []
    for item in data:
        protocol = _dig(item, ("protocolSection",), {})
        filtered_data.append({
            name: _dig(protocol, path, make_default())
            for name, path, make_default in _FIELD_SPECS
        })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=4)

    return filtered_data
115
+
116
def llm_config(model: str = "gpt-4", temperature: float = 0.6):
    """Build the prompt -> LLM chain that summarises a list of clinical trials.

    The chain takes ``{"input": <trial records>}`` and returns a
    ``Classification`` instance (structured output), whose ``get_dict()``
    method gives a plain dictionary.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable. NOTE: a key was previously hard-coded in this function and
    committed to version control; that key should be revoked.

    Args:
        model: OpenAI chat model name.
        temperature: Sampling temperature passed to the model.

    Returns:
        A runnable chain (``tagging_prompt | llm``).

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before calling llm_config()."
        )

    tagging_prompt = ChatPromptTemplate.from_template(
        """
    Extract the desired information from the following list of JSON clinical trials.

    Only extract the properties mentioned in the 'Classification' function.

    Passage:
    {input}

    """
    )

    class Classification(BaseModel):
        """Schema the model is asked to fill for the whole batch of trials."""

        description: str = Field(description= "text description grouping all the clinical trials using brief_description and detailed_description keys")
        project_title: list = Field(description="Extract the project title of all the clinical trials")
        status: list= Field(description="Extract the status of all the clinical trials")
        keywords: list= Field(description="Extract the most relevant keywords regrouping all the clinical trials")
        interventions: list= Field(description="describe the interventions for each clinical trial using title, name and description")
        primary_outcomes: list= Field(description= "get the primary outcomes of each clinical trial")
        # secondary_outcomes: list= Field(description= "get the secondary outcomes of each clinical trial")
        eligibility: list= Field(description= "get the eligibilityCriteria grouping all the clinical trials")
        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
        minimum_age: list = Field(description="get the minimum age from each experiment")
        maximum_age: list = Field(description="get the maximum age from each experiment")
        gender: list = Field(description="get the gender from each experiment")

        def get_dict(self):
            """Return the extracted fields as a plain dict (``description`` is exposed as ``summary``)."""
            return {
                "summary": self.description,
                "project_title": self.project_title,
                "status": self.status,
                "keywords": self.keywords,
                "interventions": self.interventions,
                "primary_outcomes": self.primary_outcomes,
                # "secondary_outcomes": self.secondary_outcomes,
                "eligibility": self.eligibility,
                # "healthy_volunteers": self.healthy_volunteers,
                "minimum_age": self.minimum_age,
                "maximum_age": self.maximum_age,
                "gender": self.gender
            }

    # LLM constrained to answer in the shape of ``Classification``.
    llm = ChatOpenAI(
        temperature=temperature,
        model=model,
        openai_api_key=api_key,
    ).with_structured_output(
        Classification
    )

    tagging_chain = tagging_prompt | llm

    return tagging_chain
171
+
172
def get_llm_results(results):
    """Convert a structured LLM result into a plain dict via its ``get_dict()`` method."""
    return results.get_dict()
175
+
176
def save_llm_results(results_json, output_file='llm_results.json'):
    """Write the LLM results to disk as indented JSON.

    Args:
        results_json: JSON-serialisable object (the dict produced by
            ``get_llm_results``).
        output_file: Destination path; defaults to ``'llm_results.json'`` in
            the current working directory.

    Raises:
        OSError: If ``output_file`` cannot be opened for writing.
        TypeError: If ``results_json`` is not JSON-serialisable.
    """
    # Explicit encoding so the file is the same on every platform.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results_json, f, indent=4)
179
+
180
+ # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
181
+ # print(clinical_record_info)
182
+
183
+ # with open('data.json', 'w') as f:
184
+ # json.dump(clinical_record_info, f, indent=4)
185
+
186
def main(json_file: str = "D:/HACKUPC/hupc/klinic/data.json") -> None:
    """Run the full pipeline: filter the raw records, query the LLM, save and print the result.

    Args:
        json_file: Path to the raw clinical-trial JSON dump. The default is
            the path that used to be hard-coded here; pass a different path
            on the command line instead of editing this file.
    """
    process_json(json_file)

    # process_json writes its filtered records to 'output.json'.
    with open('output.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    tagging_chain = llm_config()
    # The prompt says the passage is JSON, so send real JSON rather than the
    # Python repr that interpolating the list directly would produce.
    res = tagging_chain.invoke({"input": json.dumps(data)})
    result_json = get_llm_results(res)
    save_llm_results(result_json)
    print(result_json)


# Guarded so that importing this module does not trigger file I/O or an LLM call.
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main()