XufengDuan committed
Commit ce1e7cd · Parent(s): 27d8f5d

update scripts

Changed files:
- src/backend/model_operations.py +22 -16
- src/display/about.py +13 -1
src/backend/model_operations.py
CHANGED
@@ -394,7 +394,20 @@ class ResponseGenerator:
             result = result.replace(prompt[0], '')
             print(result)
             return result
-
+        # Using OpenAI API
+        elif 'gpt' in self.model_id.lower():
+            response = litellm.completion(
+                model=self.model_id.replace('openai/', ''),
+                messages=[{"role": "system", "content": system_prompt},
+                          {"role": "user", "content": user_prompt}],
+                # temperature=0.0,
+                max_tokens=100,
+                api_key=os.getenv('OpenAI_key')
+            )
+            result = response['choices'][0]['message']['content']
+            # print()
+            # print(result)
+            return result
 
         elif self.local_model is None:
             import random
@@ -474,20 +487,7 @@
             result = convo.last.text
             print(result)
             return result
-
-        elif 'gpt' in self.model_id.lower():
-            response = litellm.completion(
-                model=self.model_id.replace('openai/',''),
-                messages=[{"role": "system", "content": system_prompt},
-                          {"role": "user", "content": user_prompt}],
-                # temperature=0.0,
-                max_tokens=100,
-                api_key = os.getenv('OpenAI_key')
-            )
-            result = response['choices'][0]['message']['content']
-            # print()
-            print(result)
-            return result
+
         # exit()
         # Using local model
 
@@ -640,7 +640,7 @@ class EvaluationModel:
             filtered_lines.insert(0, lines[0])
         else:
             filtered_lines = lines
-        print(filtered_lines)
+        # print(filtered_lines)
 
         filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
         rs = "\n".join(filtered_lines)
@@ -884,6 +884,7 @@ class EvaluationModel:
         human_e5 = create_e5_entries(human_df)
         llm_e5 = create_e5_entries(llm_df)
 
+
         # Remove E5 and E51 entries from both datasets
         human_df = human_df[~human_df['Question_ID'].str.contains('E5')]
         llm_df = llm_df[~llm_df['Question_ID'].str.contains('E5')]
@@ -895,10 +896,15 @@
 
         ### Calculate Average JS Divergence ###
 
+
         # Extract the relevant columns for JS divergence calculation
         human_responses = human_df[['Question_ID', 'Coding']]
         llm_responses = llm_df[['Question_ID', 'Coding']]
 
+        # Remove 'Other' responses
+        human_responses = human_responses[human_responses['Coding'] != 'Other']
+        llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
+
         # Get unique Question_IDs present in both datasets
         common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
 
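The last two hunks prepare the inputs for the "Average JS Divergence" between human and model response distributions, but the averaging itself sits outside this diff. Below is a minimal sketch of what that step might look like, assuming each question's `Coding` values are binned into aligned probability distributions and compared with SciPy's `jensenshannon`; the helper name and every detail are assumptions for illustration, not code from this repository.

```python
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon

def avg_js_divergence(human_responses: pd.DataFrame,
                      llm_responses: pd.DataFrame) -> float:
    # Question IDs present in both datasets, mirroring the diff above.
    common_ids = set(human_responses['Question_ID']) & set(llm_responses['Question_ID'])
    divergences = []
    for qid in sorted(common_ids):
        human = human_responses.loc[human_responses['Question_ID'] == qid, 'Coding']
        llm = llm_responses.loc[llm_responses['Question_ID'] == qid, 'Coding']
        # Align both distributions over the union of observed response categories.
        categories = sorted(set(human) | set(llm))
        p = human.value_counts(normalize=True).reindex(categories, fill_value=0.0)
        q = llm.value_counts(normalize=True).reindex(categories, fill_value=0.0)
        # SciPy returns the JS *distance*; squaring gives the divergence.
        divergences.append(jensenshannon(p, q, base=2) ** 2)
    return float(np.mean(divergences)) if divergences else float('nan')
```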
src/display/about.py
CHANGED
@@ -40,7 +40,19 @@ TITLE = """<h1 align="center" id="space-title">Humanlike Evaluation Model (HEM)
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+Welcome to the Humanlikeness Leaderboard, curated by [Xufeng Duan](https://xufengduan.github.io/). This platform rigorously evaluates the alignment between human and model responses in language processing, using ten carefully designed psycholinguistic tasks to quantify a model's humanlikeness:<br><br>
+1. **Sounds:** Sound Shape Association<br>
+2. **Sounds:** Sound Gender Association<br>
+3. **Word:** Word Length and Predictivity<br>
+4. **Word:** Word Meaning Priming<br>
+5. **Syntax:** Structural Priming<br>
+6. **Syntax:** Syntactic Ambiguity Resolution<br>
+7. **Meaning:** Implausible Sentence Interpretation<br>
+8. **Meaning:** Semantic Illusion<br>
+9. **Discourse:** Implicit Causality<br>
+10. **Discourse:** Drawing Inferences<br><br>
+Each task is composed of multiple stimuli designed to elicit both expected and unexpected responses. We have gathered data from 2,000 human participants, generating response distributions that reflect natural human behavior across these tasks. By presenting identical stimuli to advanced language models, we generate corresponding response distributions for comparison.<br><br>
+The degree of congruence between these human and model distributions offers a precise measure of the model's humanlikeness.<br>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
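As a purely illustrative footnote to the introduction text added above: the "degree of congruence" between a human and a model response distribution for a single stimulus can be pictured as follows. The counts are invented, and scoring congruence as one minus the Jensen-Shannon distance is an assumed stand-in, not the leaderboard's published metric.

```python
from collections import Counter
from scipy.spatial.distance import jensenshannon

# Invented counts for one stimulus: how often each response category occurred.
human = Counter({'expected': 1400, 'unexpected': 600})  # hypothetical human data
model = Counter({'expected': 1800, 'unexpected': 200})  # hypothetical model data

# Turn raw counts into probability distributions over the same categories.
categories = sorted(set(human) | set(model))
p = [human[c] / sum(human.values()) for c in categories]
q = [model[c] / sum(model.values()) for c in categories]

# Assumed placeholder score: 1 = identical distributions, 0 = maximally different.
congruence = 1 - jensenshannon(p, q, base=2)
print(f"congruence: {congruence:.3f}")
```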