Spaces:

klinic-hackupc
/

klinic

Sleeping

App Files Files Community

1-ARIjitS committed on May 5

Commit

86f6253

•

1 Parent(s): 52ee7a9

tagging included

Browse files

Files changed (1) hide show

llm_res.py +149 -54

llm_res.py CHANGED Viewed

@@ -44,23 +44,106 @@ def get_clinical_records_by_ids(clinical_record_ids: List[str]) -> List[Dict[str
     return clinical_records
-def process_json_data_for_llm(data):
-    # Define the fields you want to keep
-    fields_to_keep = [
-        "class_of_organization",
-        "title",
-        "overallStatus",
-        "descriptionModule",
-        "conditions",
-        "interventions",
-        "outcomesModule",
-        "eligibilityModule",
-    ]
     # Iterate through the dictionary and keep only the desired fields
     filtered_data = []
-    for item in data:
         try:
             organization_name = item["protocolSection"]["identificationModule"][
                 "organization"
@@ -132,22 +215,24 @@ def process_json_data_for_llm(data):
             "eligibility": eligibility,
         }
         filtered_data.append(filtered_item)
-    # for ele in filtered_data:
-    #     print(ele)
 def get_short_summary_out_of_json_files(data_json):
-    prompt_template = """ You are an expert clinician working on the analysis of reports of clinical trials.
-# Task
-You will be given a set of descriptions of clinical trials. Your job is to come up with a short summary (100-200 words) of the descriptions of the clinical trials. Your users are clinical researchers who are experts in medicine, so you should be technical and specific, including scientific terms. Always be faithful to the original information written in the reports.
-To write your summary, you will need to read the following examples, labeled as "Report 1", "Report 2", and so on. Your answer should be a single paragraph (100-200 words) that summarizes the general content of all the reports.
-{text}
-General summary:"""
     prompt = PromptTemplate.from_template(prompt_template)
@@ -178,18 +263,31 @@ General summary:"""
     print(f"Combined descriptions: {combined_descriptions}")
     result = stuff_chain.run(combined_descriptions)
-    print(f"Result: {result}")
     return result
-def taggingTemplate():
     class Classification(BaseModel):
-        description: str = Field(
-            description="text description grouping all the clinical trials using briefDescription and detailedDescription keys"
-        )
         project_title: list = Field(
-            description="Extract the project title of all the clinical trials"
         )
         status: list = Field(
             description="Extract the status of all the clinical trials"
@@ -207,43 +305,45 @@ def taggingTemplate():
         # eligibility: list = Field(
         #    description="get the eligibilityCriteria grouping all the clinical trials"
         # )
-        # healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
-        # minimum_age: list = Field(
-        #    description="get the minimum age from each experiment"
-        # )
-        # maximum_age: list = Field(
-        #    description="get the maximum age from each experiment"
-        # )
-        # gender: list = Field(description="get the gender from each experiment")
         def get_dict(self):
             return {
-                "summary": self.description,
                 "project_title": self.project_title,
                 "status": self.status,
-                "keywords": self.keywords,
                 "interventions": self.interventions,
                 "primary_outcomes": self.primary_outcomes,
                 # "secondary_outcomes": self.secondary_outcomes,
-                "eligibility": self.eligibility,
-                # "healthy_volunteers": self.healthy_volunteers,
                 "minimum_age": self.minimum_age,
                 "maximum_age": self.maximum_age,
-                "gender": self.gender,
             }
     # LLM
     llm = ChatOpenAI(
         temperature=0.6,
-        model="gpt-4",
         openai_api_key=os.environ["OPENAI_API_KEY"],
     ).with_structured_output(Classification)
-    stuff_chain = StuffDocumentsChain(llm_chain=llm, document_variable_name="text")
-    # tagging_chain = prompt_template | llm
-    # return tagging_chain
 # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
@@ -252,10 +352,5 @@ def taggingTemplate():
 # with open('data.json', 'w') as f:
 #     json.dump(clinical_record_info, f, indent=4)
-# tagging_chain = llm_config()
-def process_dictionaty_with_llm_to_generate_response(json_contents):
-    processed_data = process_json_data_for_llm(json_contents)
-    # res = tagging_chain.invoke({"input": processed_data})
-    # return res

     return clinical_records
+# # def process_json_data_for_llm(data):
+#     # Define the fields you want to keep
+#     fields_to_keep = [
+#         "class_of_organization",
+#         "title",
+#         "overallStatus",
+#         "descriptionModule",
+#         "conditions",
+#         "interventions",
+#         "outcomesModule",
+#         "eligibilityModule",
+#     ]
+#     # Iterate through the dictionary and keep only the desired fields
+#     filtered_data = []
+#     for item in data:
+#         try:
+#             organization_name = item["protocolSection"]["identificationModule"][
+#                 "organization"
+#             ]["fullName"]
+#         except:
+#             organization_name = ""
+#         try:
+#             project_title = item["protocolSection"]["identificationModule"][
+#                 "officialTitle"
+#             ]
+#         except:
+#             project_title = ""
+#         try:
+#             status = item["protocolSection"]["statusModule"]["overallStatus"]
+#         except:
+#             status = ""
+#         try:
+#             briefDescription = item["protocolSection"]["descriptionModule"][
+#                 "briefSummary"
+#             ]
+#         except:
+#             briefDescription = ""
+#         try:
+#             detailedDescription = item["protocolSection"]["descriptionModule"][
+#                 "detailedDescription"
+#             ]
+#         except:
+#             detailedDescription = ""
+#         try:
+#             conditions = item["protocolSection"]["conditionsModule"]["conditions"]
+#         except:
+#             conditions = []
+#         try:
+#             keywords = item["protocolSection"]["conditionsModule"]["keywords"]
+#         except:
+#             keywords = []
+#         try:
+#             interventions = item["protocolSection"]["armsInterventionsModule"][
+#                 "interventions"
+#             ]
+#         except:
+#             interventions = []
+#         try:
+#             primary_outcomes = item["protocolSection"]["outcomesModule"][
+#                 "primaryOutcomes"
+#             ]
+#         except:
+#             primary_outcomes = []
+#         try:
+#             secondary_outcomes = item["protocolSection"]["outcomesModule"][
+#                 "secondaryOutcomes"
+#             ]
+#         except:
+#             secondary_outcomes = []
+#         try:
+#             eligibility = item["protocolSection"]["eligibilityModule"]
+#         except:
+#             eligibility = {}
+#         filtered_item = {
+#             "organization_name": organization_name,
+#             "project_title": project_title,
+#             "status": status,
+#             "briefDescription": briefDescription,
+#             "detailedDescription": detailedDescription,
+#             "keywords": keywords,
+#             "interventions": interventions,
+#             "primary_outcomes": primary_outcomes,
+#             "secondary_outcomes": secondary_outcomes,
+#             "eligibility": eligibility,
+#         }
+#         filtered_data.append(filtered_item)
+#     return filtered_data
+#     # for ele in filtered_data:
+#     #     print(ele)
+def process_dictionaty_with_llm_to_generate_response(json_data):
+    # processed_data = process_json_data_for_llm(json_data)
+    # res = tagging_chain.invoke({"input": processed_data})
+    # return res
     # Iterate through the dictionary and keep only the desired fields
     filtered_data = []
+    for item in json_data:
         try:
             organization_name = item["protocolSection"]["identificationModule"][
                 "organization"
             "eligibility": eligibility,
         }
         filtered_data.append(filtered_item)
+    return filtered_data
 def get_short_summary_out_of_json_files(data_json):
+#     prompt_template = """ You are an expert clinician working on the analysis of reports of clinical trials.
+#       # Task
+#       You will be given a set of descriptions of clinical trials. Your job is to come up with a short summary (100-200 words) of the descriptions of the clinical trials. Your users are clinical researchers who are experts in medicine, so you should be technical and specific, including scientific terms. Always be faithful to the original information written in the reports.
+#       To write your summary, you will need to read the following examples, labeled as "Report 1", "Report 2", and so on. Your answer should be a single paragraph (100-200 words) that summarizes the general content of all the reports.
+# {text}
+# General summary:"""
+    prompt_template = """ You are an expert on clinicial trials and their analysis of their reports.
+          # Task
+          You will be given a text of descriptions of multiple clinical trials realed to similar diseases. Your job is to come up with a short and detailed summary of the descriptions of the clinical trials. Your users are clinical researchers, so you should be technical and specific, including scientific terms in the summary."""
     prompt = PromptTemplate.from_template(prompt_template)
     print(f"Combined descriptions: {combined_descriptions}")
     result = stuff_chain.run(combined_descriptions)
+    print(f"Result_summarization: {result}")
     return result
+def tagging_insights_from_json(data_json):
+    processed_json= process_dictionaty_with_llm_to_generate_response(data_json)
+    tagging_prompt = ChatPromptTemplate.from_template(
+        """
+    You are an expert on clinicial trials and analysis of their reports.
+    Extract the desired information from the following JSON data.
+    Only extract the properties mentioned in the 'Classification' function.
+    JSON data:
+    {input}
+    """
+    )
     class Classification(BaseModel):
+        # description: str = Field(
+        #     description="text description grouping all the clinical trials using briefDescription and detailedDescription keys"
+        # )
         project_title: list = Field(
+            description="Extract the project titles of all the clinical trials"
         )
         status: list = Field(
             description="Extract the status of all the clinical trials"
         # eligibility: list = Field(
         #    description="get the eligibilityCriteria grouping all the clinical trials"
         # )
+        healthy_volunteers: list= Field(description= "determine whether the clinical trial requires healthy volunteers")
+        minimum_age: list = Field(
+           description="get the minimum age from each experiment"
+        )
+        maximum_age: list = Field(
+           description="get the maximum age from each experiment"
+        )
+        gender: list = Field(description="get the gender from each experiment")
         def get_dict(self):
             return {
                 "project_title": self.project_title,
                 "status": self.status,
+                # "keywords": self.keywords,
                 "interventions": self.interventions,
                 "primary_outcomes": self.primary_outcomes,
                 # "secondary_outcomes": self.secondary_outcomes,
+                # "eligibility": self.eligibility,
+                "healthy_volunteers": self.healthy_volunteers,
                 "minimum_age": self.minimum_age,
                 "maximum_age": self.maximum_age,
+                "gender": self.gender
             }
     # LLM
     llm = ChatOpenAI(
         temperature=0.6,
+        model="gpt-4-turbo",
         openai_api_key=os.environ["OPENAI_API_KEY"],
     ).with_structured_output(Classification)
+    # stuff_chain = StuffDocumentsChain(llm_chain=llm, document_variable_name="text")
+    tagging_chain = tagging_prompt | llm
+    res= tagging_chain.invoke({"input": processed_json})
+    result_dict= res.get_dict()
+    print(f"Result_tagging: {result_dict}")
+    return result_dict
 # clinical_record_info = get_clinical_records_by_ids(['NCT00841061', 'NCT03035123', 'NCT02272751', 'NCT03035123', 'NCT03055377'])
 # with open('data.json', 'w') as f:
 #     json.dump(clinical_record_info, f, indent=4)
+# tagging_chain = tagging_insights_from_json(json_data)