# Per-dataset description tables.
#
# `meta_keys` and `bucket_info` are positional lists: both contain one string
# per dataset, in this order:
#   papers, wikipedia, Stackexchange, europarl, ubuntu irc, hackernews, pg19,
#   uspto, freelaw, dmmaths
# Keep the two lists the same length and in the same order when adding or
# removing a dataset.
#
# Each `meta_keys` entry is a dash-bulleted list ("- key: description"), one
# item per line, with a leading newline before the first item.
# NOTE(review): the consumer of these strings is not visible here; presumably
# they are rendered as markdown/plain text -- confirm before changing layout.
meta_keys = [
    # papers
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- source: url to download\n"
        "- file: id to identify PMC files\n"
        "- Corpus id: id to identify corpus\n"
        "- Openaccessinfo: License/doi information\n"
        "- Pmid: pubmed id\n"
        "- Title: title of the paper, mainly for philpapers dataset\n"
        "- Type: type of data, mainly for philpapers dataset\n"
        "- Creator: creator of the data, mainly for philpapers dataset\n"
        "- Subject: subject of the data, mainly for philpapers dataset\n"
        "- Date: date of the data, mainly for philpapers dataset\n"
        "- Identifier: identifier of the data, mainly for philpapers dataset\n"
        "- Description: description of the data, mainly for philpapers dataset\n"
        "- Datestamp: datestamp of the data, mainly for philpapers dataset\n"
    ),
    # wikipedia
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- url: wikipedia link\n"
        "- title: title of the article\n"
        "- language: language of the article\n"
    ),
    # Stackexchange
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- source: which stackexchange it is coming from, example: pt.stackoverflow\n"
        "- answer_score: list of scores of answer in same order as answer in text\n"
        "- comment_score: list of scores of comment in same order as comment in text\n"
    ),
    # europarl
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- language: language of the text\n"
    ),
    # ubuntu irc
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- Channel: channel of the irc\n"
        "- month: month of the post\n"
    ),
    # hackernews
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        # Fixed typo: was "indetifier".
        "- Id: Unique identifier of the post\n"
    ),
    # pg19
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- short_book_title: short title of the book\n"
        "- publication_date: publication date of the book\n"
        "- url: url of the book\n"
    ),
    # uspto
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- bibliographic_information: bibliographic information, may contain title\n"
        "- source_file: link of the source file\n"
        "- Abstract: abstract of the document\n"
        "- Citations: list of citations\n"
        # Was "details about inventors", a copy of the Inventors line below.
        "- Assignees: details about assignees\n"
        "- Classification: classification of the document\n"
        "- Inventors: details about inventors\n"
    ),
    # freelaw
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
    ),
    # dmmaths
    (
        "\n"
        "- dup_signals: relevant information related to dedup metrics\n"
        "- source: field of maths\n"
    ),
]

# One entry per dataset, same order as `meta_keys`. Every entry is currently a
# whitespace-only string.
bucket_info = [
    " ",  # papers
    " ",  # wikipedia
    " ",  # Stackexchange
    " ",  # europarl
    " ",  # ubuntu irc
    " ",  # hackernews
    " ",  # pg19
    " ",  # uspto
    " ",  # freelaw
    " ",  # dmmaths
]