XufengDuan committed on
Commit ce1e7cd
1 Parent(s): 27d8f5d

update scripts

src/backend/model_operations.py CHANGED
@@ -394,7 +394,20 @@ class ResponseGenerator:
             result = result.replace(prompt[0], '')
             print(result)
             return result
-
+        # Using OpenAI API
+        elif 'gpt' in self.model_id.lower():
+            response = litellm.completion(
+                model=self.model_id.replace('openai/', ''),
+                messages=[{"role": "system", "content": system_prompt},
+                          {"role": "user", "content": user_prompt}],
+                # temperature=0.0,
+                max_tokens=100,
+                api_key=os.getenv('OpenAI_key')
+            )
+            result = response['choices'][0]['message']['content']
+            # print()
+            # print(result)
+            return result
 
         elif self.local_model is None:
             import random
@@ -474,20 +487,7 @@ class ResponseGenerator:
             result = convo.last.text
             print(result)
             return result
-        # Using OpenAI API
-        elif 'gpt' in self.model_id.lower():
-            response = litellm.completion(
-                model=self.model_id.replace('openai/',''),
-                messages=[{"role": "system", "content": system_prompt},
-                          {"role": "user", "content": user_prompt}],
-                # temperature=0.0,
-                max_tokens=100,
-                api_key = os.getenv('OpenAI_key')
-            )
-            result = response['choices'][0]['message']['content']
-            # print()
-            print(result)
-            return result
+
         # exit()
         # Using local model
 
@@ -640,7 +640,7 @@ class EvaluationModel:
             filtered_lines.insert(0, lines[0])
         else:
             filtered_lines = lines
-        print(filtered_lines)
+        # print(filtered_lines)
 
         filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
         rs = "\n".join(filtered_lines)
@@ -884,6 +884,7 @@ class EvaluationModel:
         human_e5 = create_e5_entries(human_df)
         llm_e5 = create_e5_entries(llm_df)
 
+
         # Remove E5 and E51 entries from both datasets
         human_df = human_df[~human_df['Question_ID'].str.contains('E5')]
         llm_df = llm_df[~llm_df['Question_ID'].str.contains('E5')]
@@ -895,10 +896,15 @@ class EvaluationModel:
 
         ### Calculate Average JS Divergence ###
 
+
        # Extract the relevant columns for JS divergence calculation
         human_responses = human_df[['Question_ID', 'Coding']]
         llm_responses = llm_df[['Question_ID', 'Coding']]
 
+        # Remove 'Other' responses
+        human_responses = human_responses[human_responses['Coding'] != 'Other']
+        llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
+
         # Get unique Question_IDs present in both datasets
         common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
 
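The last hunk above now drops `'Other'` codings before the JS-divergence comparison. For reference, here is a minimal sketch of how an average Jensen-Shannon divergence over the shared `Question_ID`s could be computed from these `Coding` columns. The per-question aggregation and the use of scipy are assumptions for illustration, not the repository's exact implementation (scipy's `jensenshannon` returns a distance, so it is squared to obtain the divergence):

```python
# Sketch only: average JS divergence between human and model response
# distributions, computed per Question_ID over shared response categories.
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon

def average_js_divergence(human_responses: pd.DataFrame,
                          llm_responses: pd.DataFrame) -> float:
    """Both frames carry 'Question_ID' and 'Coding' columns ('Other' already removed)."""
    common_ids = set(human_responses['Question_ID']) & set(llm_responses['Question_ID'])
    divergences = []
    for qid in common_ids:
        human_counts = human_responses.loc[human_responses['Question_ID'] == qid, 'Coding'].value_counts()
        llm_counts = llm_responses.loc[llm_responses['Question_ID'] == qid, 'Coding'].value_counts()
        # Align both distributions on the union of observed response categories.
        categories = human_counts.index.union(llm_counts.index)
        p = human_counts.reindex(categories, fill_value=0).to_numpy(dtype=float)
        q = llm_counts.reindex(categories, fill_value=0).to_numpy(dtype=float)
        if p.sum() == 0 or q.sum() == 0:
            continue  # skip questions with no usable responses on either side
        p /= p.sum()
        q /= q.sum()
        # scipy returns the JS distance (sqrt of the divergence); square it.
        divergences.append(jensenshannon(p, q, base=2) ** 2)
    return float(np.mean(divergences)) if divergences else float('nan')
```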
src/display/about.py CHANGED
@@ -40,7 +40,19 @@ TITLE = """<h1 align="center" id="space-title">Humanlike Evaluation Model (HEM)
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard (by [Xufeng Duan](https://xufengduan.github.io/)) evaluates the similarities between human and model responses in language use <br>
+Welcome to the Humanlikeness Leaderboard, curated by [Xufeng Duan](https://xufengduan.github.io/). This platform rigorously evaluates the alignment between human and model responses in language processing, utilizing ten carefully designed psycholinguistic tasks to quantify a model's humanlikeness:<br><br>
+1. **Sounds:** Sound Shape Association<br>
+2. **Sounds:** Sound Gender Association<br>
+3. **Word:** Word Length and Predictivity<br>
+4. **Word:** Word Meaning Priming<br>
+5. **Syntax:** Structural Priming<br>
+6. **Syntax:** Syntactic Ambiguity Resolution<br>
+7. **Meaning:** Implausible Sentence Interpretation<br>
+8. **Meaning:** Semantic Illusion<br>
+9. **Discourse:** Implicit Causality<br>
+10. **Discourse:** Drawing Inferences<br><br>
+Each task is composed of multiple stimuli, designed to elicit both expected and unexpected responses. We have gathered data from 2000 human participants, generating response distributions that reflect natural human behavior across these tasks. By presenting identical stimuli to advanced language models, we generate corresponding response distributions for comparison.<br><br>
+The degree of congruence between these human and model distributions offers a precise measure of the model's humanlikeness.<br>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
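As a toy illustration of the congruence measure described in the new introduction text, using hypothetical proportions rather than actual leaderboard data:

```python
# Toy illustration with hypothetical numbers, not actual leaderboard data.
# Congruence between human and model response distributions can be
# quantified with the Jensen-Shannon divergence (0 = identical, 1 = disjoint).
from scipy.spatial.distance import jensenshannon

# Hypothetical shares of responses per coding category for one stimulus,
# e.g. "expected" vs. "unexpected" interpretations.
human_dist = [0.8, 0.2]
model_dist = [0.6, 0.4]

js_div = jensenshannon(human_dist, model_dist, base=2) ** 2
print(f"JS divergence: {js_div:.3f}")  # smaller = more humanlike on this item
```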