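# Web file-viewer header captured with this file; commented out so that the
# YAML document below can be parsed.
# lilac / data /lilac.yml
# nsthorat-lilac's picture
# Push to HF space
# 4ebad3a verified
# raw
# history blame
# 11.7 kB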
# Dataset entries. Each entry carries a namespace, a name and a source.
datasets:
# Capybara: `huggingface` source, dataset `LDJnr/Capybara`.
# gte-small embeddings on conversation.*.input and conversation.*.output.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: Capybara
  source:
    dataset_name: LDJnr/Capybara
    source_name: huggingface
  embeddings:
  - path:
    - conversation
    - '*'
    - input
    embedding: gte-small
  - path:
    - conversation
    - '*'
    - output
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - - conversation
        - '*'
        - input
      - - conversation
        - '*'
        - output
    tags:
    - datasets
# glaive-code-assistant: `huggingface` source, dataset
# `glaiveai/glaive-code-assistant`. gte-small embeddings on `question` and
# `answer`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: glaive-code-assistant
  source:
    dataset_name: glaiveai/glaive-code-assistant
    source_name: huggingface
  embeddings:
  - path: question
    embedding: gte-small
  - path: answer
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - question
      - answer
    tags:
    - datasets
# glaive-function-calling-v2: `huggingface` source, dataset
# `lilacai/glaive-function-calling-v2-sharegpt`. gte-small embedding on
# conversations.*.value.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: glaive-function-calling-v2
  source:
    dataset_name: lilacai/glaive-function-calling-v2-sharegpt
    source_name: huggingface
  embeddings:
  - path:
    - conversations
    - '*'
    - value
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - - conversations
        - '*'
        - value
    tags:
    - datasets
# open-assistant-conversations-2: `huggingface` source, dataset
# `OpenAssistant/oasst2`. gte-small embedding on `text`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: open-assistant-conversations-2
  source:
    dataset_name: OpenAssistant/oasst2
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - text
    tags:
    - datasets
# lmsys-chat-1m: `huggingface` source, dataset `lmsys/lmsys-chat-1m`.
# gte-small embedding on conversation.*.content. Tagged `logs`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: lmsys-chat-1m
  source:
    dataset_name: lmsys/lmsys-chat-1m
    source_name: huggingface
  embeddings:
  - path:
    - conversation
    - '*'
    - content
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - - conversation
        - '*'
        - content
    tags:
    - logs
# OpenOrca: `huggingface` source, dataset `Open-Orca/OpenOrca`.
# gte-small embedding on `question` only; `response` appears in media_paths
# but has no embedding entry.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: OpenOrca
  source:
    dataset_name: Open-Orca/OpenOrca
    source_name: huggingface
  embeddings:
  - path: question
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - question
      - response
    tags:
    - datasets
# SlimOrca: `huggingface` source, dataset `Open-Orca/SlimOrca`.
# gte-small embedding on conversations.*.value.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: SlimOrca
  source:
    dataset_name: Open-Orca/SlimOrca
    source_name: huggingface
  embeddings:
  - path:
    - conversations
    - '*'
    - value
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - - conversations
        - '*'
        - value
    tags:
    - datasets
# UltraChat-200k: `huggingface` source, dataset
# `HuggingFaceH4/ultrachat_200k`. No embeddings are configured for this entry.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: UltraChat-200k
  source:
    dataset_name: HuggingFaceH4/ultrachat_200k
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - - messages
        - '*'
        - content
    tags:
    - datasets
# roblox_luau_corpus: `huggingface` source, dataset `Roblox/luau_corpus`.
# gte-small embeddings on `prompt` and `completion`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: roblox_luau_corpus
  source:
    dataset_name: Roblox/luau_corpus
    source_name: huggingface
  embeddings:
  - path: prompt
    embedding: gte-small
  - path: completion
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - prompt
      - completion
    tags:
    - datasets
# hncomments-1m: `parquet` source reading one local file, with
# sample_size 1000000. gte-small embedding on `text`.
# NOTE(review): nesting reconstructed from flattened text; `sample_size` is
# assumed to belong to `source` and `tags` to `settings` - confirm against the
# consumer's schema.
- namespace: lilac
  name: hncomments-1m
  source:
    filepaths:
    # NOTE(review): absolute path under one user's home directory; it will
    # not resolve on other machines - confirm whether this is intended.
    - /Users/brian/dev/lilac/data/datasets/local/hncomments-duckprogress/data-00000-of-00001.parquet
    sample_size: 1000000
    source_name: parquet
  embeddings:
  - path: text
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - text
    tags:
    - datasets
# MMLU: `huggingface` source, dataset `cais/mmlu`, config `all`.
# No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text. The nested media
# path is assumed to be [choices, '*'] with `answer` as a separate entry, and
# `tags` is assumed to sit under `settings` - confirm.
- namespace: lilac
  name: MMLU
  source:
    dataset_name: cais/mmlu
    config_name: all
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - - choices
        - '*'
      - answer
    tags:
    - eval
# ARC-Easy: `huggingface` source, dataset `allenai/ai2_arc`, config
# `ARC-Easy`. No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text. The nested media
# path is assumed to be [choices, text, '*'] with `answerKey` as a separate
# entry, and `tags` is assumed to sit under `settings` - confirm.
- namespace: lilac
  name: ARC-Easy
  source:
    dataset_name: allenai/ai2_arc
    config_name: ARC-Easy
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - - choices
        - text
        - '*'
      - answerKey
    tags:
    - eval
# ARC-Challenge: `huggingface` source, dataset `allenai/ai2_arc`, config
# `ARC-Challenge`. No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text. The nested media
# path is assumed to be [choices, text, '*'] with `answerKey` as a separate
# entry, and `tags` is assumed to sit under `settings` - confirm.
- namespace: lilac
  name: ARC-Challenge
  source:
    dataset_name: allenai/ai2_arc
    config_name: ARC-Challenge
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - - choices
        - text
        - '*'
      - answerKey
    tags:
    - eval
# HellaSwag: `huggingface` source, dataset `Rowan/hellaswag`.
# No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text. The nested media
# path is assumed to be [endings, '*'], and `tags` is assumed to sit under
# `settings` - confirm.
- namespace: lilac
  name: HellaSwag
  source:
    dataset_name: Rowan/hellaswag
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - ctx
      - ctx_a
      - ctx_b
      - - endings
        - '*'
    tags:
    - eval
# HumanEval: `huggingface` source, dataset `openai_humaneval`.
# No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: HumanEval
  source:
    dataset_name: openai_humaneval
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - prompt
      - canonical_solution
      - test
    tags:
    - eval
# mbpp: `huggingface` source, dataset `mbpp`.
# No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: mbpp
  source:
    dataset_name: mbpp
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - code
      - text
    tags:
    - eval
# TruthfulQA-MultipleChoice: `huggingface` source, dataset `truthful_qa`,
# config `multiple_choice`. No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text. The nested media
# paths are assumed to be [mc1_targets, choices, '*'] and
# [mc2_targets, choices, '*'], and `tags` is assumed to sit under `settings`
# - confirm.
- namespace: lilac
  name: TruthfulQA-MultipleChoice
  source:
    dataset_name: truthful_qa
    config_name: multiple_choice
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - - mc1_targets
        - choices
        - '*'
      - - mc2_targets
        - choices
        - '*'
    tags:
    - eval
# TruthfulQA-Generation: `huggingface` source, dataset `truthful_qa`, config
# `generation`. No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text. The nested media
# paths are assumed to be [correct_answers, '*'] and [incorrect_answers, '*']
# with `source` as a separate entry, and `tags` is assumed to sit under
# `settings` - confirm.
- namespace: lilac
  name: TruthfulQA-Generation
  source:
    dataset_name: truthful_qa
    config_name: generation
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - - correct_answers
        - '*'
      - - incorrect_answers
        - '*'
      - source
    tags:
    - eval
# GSM8K-main: `huggingface` source, dataset `gsm8k`, config `main`.
# No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: GSM8K-main
  source:
    dataset_name: gsm8k
    config_name: main
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - answer
    tags:
    - eval
# GSM8K-socratic: `huggingface` source, dataset `gsm8k`, config `socratic`.
# No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: GSM8K-socratic
  source:
    dataset_name: gsm8k
    config_name: socratic
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - answer
    tags:
    - eval
# WinoGrande: `huggingface` source, dataset `winogrande`, config
# `winogrande_xl`. No embeddings are configured. Tagged `eval`.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: WinoGrande
  source:
    dataset_name: winogrande
    config_name: winogrande_xl
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - sentence
      - option1
      - option2
      - answer
    tags:
    - eval
# databricks-dolly-15k-curated-en: `huggingface` source, dataset
# `argilla/databricks-dolly-15k-curated-en`. gte-small embeddings on the three
# `original-*` fields; the `new-*` fields appear only in media_paths.
# NOTE(review): nesting reconstructed from flattened text. Each nested media
# path is assumed to be [new-<field>, value, '*'], and `tags` is assumed to
# sit under `settings` - confirm.
- namespace: lilac
  name: databricks-dolly-15k-curated-en
  source:
    dataset_name: argilla/databricks-dolly-15k-curated-en
    source_name: huggingface
  embeddings:
  - path: original-instruction
    embedding: gte-small
  - path: original-context
    embedding: gte-small
  - path: original-response
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - original-instruction
      - original-context
      - original-response
      - - new-instruction
        - value
        - '*'
      - - new-context
        - value
        - '*'
      - - new-response
        - value
        - '*'
    tags:
    - datasets
# mosaic-instruct-v3: `huggingface` source, dataset `mosaicml/instruct-v3`.
# gte-small embedding on `prompt` only; `response` appears in media_paths but
# has no embedding entry.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: mosaic-instruct-v3
  source:
    dataset_name: mosaicml/instruct-v3
    source_name: huggingface
  embeddings:
  - path: prompt
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - prompt
      - response
    tags:
    - datasets
# dolphin: `huggingface` source, dataset `cognitivecomputations/dolphin`,
# config `flan1m-alpaca-uncensored`. gte-small embedding on `instruction`
# only; `input` and `output` appear in media_paths without embeddings.
# NOTE(review): nesting reconstructed from flattened text; `tags` is assumed to
# sit under `settings` - confirm against the consumer's schema.
- namespace: lilac
  name: dolphin
  source:
    dataset_name: cognitivecomputations/dolphin
    config_name: flan1m-alpaca-uncensored
    source_name: huggingface
  embeddings:
  - path: instruction
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - instruction
      - input
      - output
    tags:
    - datasets
# Top-level options that sit outside any single dataset entry.
# NOTE(review): the exact effect of `use_garden` is not visible in this file -
# confirm against the consumer's documentation.
use_garden: true
# Signals, listed by name.
signals:
- signal_name: text_statistics
- signal_name: lang_detection
# Embedding names listed under the concept-model cache key.
concept_model_cache_embeddings:
- gte-small
- gte-base
- sbert
- openai
- cohere
# Cluster entries. Each one names a dataset and gives either an `input_path`
# or an `input_selector` (format + selector) paired with an `output_path`.
# NOTE(review): nesting reconstructed from flattened text - confirm against
# the consumer's schema.
# NOTE(review): `!!python/tuple` is a PyYAML language-specific tag that safe
# loaders reject. It is kept unchanged because the consumer's loader is not
# visible here - confirm before replacing it with a plain sequence.
clusters:
- dataset_namespace: lilac
  dataset_name: Capybara
  input_path: !!python/tuple
  - conversation
  - '*'
  - input
- dataset_namespace: lilac
  dataset_name: glaive-code-assistant
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: glaive-function-calling-v2
  input_selector:
    format: sharegpt
    selector: human
  # NOTE(review): single underscore here, while the other selector-based
  # entries use a double underscore (`conversation__clusters`,
  # `messages__clusters`) - confirm whether this is intended. Value left
  # unchanged because existing output may already live at this path.
  output_path: !!python/tuple
  - conversation_clusters
- dataset_namespace: lilac
  dataset_name: open-assistant-conversations-2
  input_path: !!python/tuple
  - text
- dataset_namespace: lilac
  dataset_name: lmsys-chat-1m
  input_selector:
    format: openai_conversation_json
    selector: user
  output_path: !!python/tuple
  - conversation__clusters
- dataset_namespace: lilac
  dataset_name: OpenOrca
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: SlimOrca
  input_selector:
    format: sharegpt
    selector: human
  output_path: !!python/tuple
  - conversation__clusters
- dataset_namespace: lilac
  dataset_name: databricks-dolly-15k-curated-en
  input_path: !!python/tuple
  - original-instruction
- dataset_namespace: lilac
  dataset_name: mosaic-instruct-v3
  input_path: !!python/tuple
  - prompt
# NOTE(review): this entry clusters `input`, while the dolphin dataset entry
# configures its embedding on `instruction` - confirm this is intended.
- dataset_namespace: lilac
  dataset_name: dolphin
  input_path: !!python/tuple
  - input
- dataset_namespace: lilac
  dataset_name: UltraChat-200k
  input_selector:
    format: openai_json
    selector: user
  output_path: !!python/tuple
  - messages__clusters
# roblox_luau_corpus has two entries: one for `prompt`, one for `completion`.
- dataset_namespace: lilac
  dataset_name: roblox_luau_corpus
  input_path: !!python/tuple
  - prompt
- dataset_namespace: lilac
  dataset_name: roblox_luau_corpus
  input_path: !!python/tuple
  - completion
- dataset_namespace: lilac
  dataset_name: MMLU
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: ARC-Easy
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: ARC-Challenge
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: HellaSwag
  input_path: !!python/tuple
  - ctx
- dataset_namespace: lilac
  dataset_name: HumanEval
  input_path: !!python/tuple
  - prompt
- dataset_namespace: lilac
  dataset_name: mbpp
  input_path: !!python/tuple
  - text
- dataset_namespace: lilac
  dataset_name: TruthfulQA-Generation
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: TruthfulQA-MultipleChoice
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: GSM8K-main
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: GSM8K-socratic
  input_path: !!python/tuple
  - question
- dataset_namespace: lilac
  dataset_name: WinoGrande
  input_path: !!python/tuple
  - sentence