nicholasKluge committed on
Commit
7f62425
1 Parent(s): da7c3ac

Upload 7 files

Browse files
app.py CHANGED
@@ -19,7 +19,7 @@ completion_tfidf_vectorizer = joblib.load('completion-vectorizer.pkl')
19
  completion_tfidf_matrix = joblib.load('completion-tfidf-matrix.pkl')
20
 
21
  hub_token = os.environ.get("HUB_TOKEN")
22
- model_id = "nicholasKluge/TeenyTinyLlama-160m-Chat"
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
 
25
  model = AutoModelForCausalLM.from_pretrained(model_id, token=hub_token)
@@ -31,7 +31,7 @@ model.to(device)
31
  intro = """
32
  O TeenyTinyLlama é um modelo de linguagem compacto baseado na arquitetura Llama 2 ([TinyLlama implementation](https://huggingface.co/TinyLlama)).Esse modelo foi projetado para oferecer recursos eficientes de processamento de linguagem natural e, ao mesmo tempo, consumir poucos recursos. Esses modelos foram treinados aproveitando as [leis de escalonamento](https://arxiv.org/abs/2203.15556) para determinar o número ideal de tokens por parâmetro e incorporando o [pré-treinamento de preferências](https://arxiv.org/abs/2112.00861).
33
 
34
- Esse repositório contém uma versão de [TeenyTinyLlama-160m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-160m) (`TeenyTinyLlama-160m-Chat`) afinada no [Instruct-Aira Dataset version 2.0](https://huggingface.co/datasets/nicholasKluge/instruct-aira-dataset-v2).
35
 
36
  ## Limitações
37
 
@@ -68,7 +68,7 @@ Se desejar apresentar uma reclamação sobre qualquer mensagem produzida, por fa
68
 
69
  with gr.Blocks(theme='freddyaboulton/dracula_revamped') as demo:
70
 
71
- gr.Markdown("""<h1><center>TeenyTinyLlama-160m-Chat 🦙💬</h1></center>""")
72
  gr.Markdown(intro)
73
 
74
 
 
19
  completion_tfidf_matrix = joblib.load('completion-tfidf-matrix.pkl')
20
 
21
  hub_token = os.environ.get("HUB_TOKEN")
22
+ model_id = "nicholasKluge/TeenyTinyLlama-460m-Chat"
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
 
25
  model = AutoModelForCausalLM.from_pretrained(model_id, token=hub_token)
 
31
  intro = """
32
  O TeenyTinyLlama é um modelo de linguagem compacto baseado na arquitetura Llama 2 ([TinyLlama implementation](https://huggingface.co/TinyLlama)).Esse modelo foi projetado para oferecer recursos eficientes de processamento de linguagem natural e, ao mesmo tempo, consumir poucos recursos. Esses modelos foram treinados aproveitando as [leis de escalonamento](https://arxiv.org/abs/2203.15556) para determinar o número ideal de tokens por parâmetro e incorporando o [pré-treinamento de preferências](https://arxiv.org/abs/2112.00861).
33
 
34
+ Esse repositório contém uma versão de [TeenyTinyLlama-460m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-460m) (`TeenyTinyLlama-460m-Chat`) afinada no [Instruct-Aira Dataset version 2.0](https://huggingface.co/datasets/nicholasKluge/instruct-aira-dataset-v2).
35
 
36
  ## Limitações
37
 
 
68
 
69
  with gr.Blocks(theme='freddyaboulton/dracula_revamped') as demo:
70
 
71
+ gr.Markdown("""<h1><center>TeenyTinyLlama-Chat 🦙💬</h1></center>""")
72
  gr.Markdown(intro)
73
 
74
 
completion-vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64bbd49731efda1677dca239d838058ef221de4770ed5663e451af1347c30637
3
  size 4036115
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add850bde149e5de855d3c0334cd99ef5055289f8d103626250db2b5a1bbd0dc
3
  size 4036115
completion_tfidf-matrix.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f7cb342da64a6334bb035d162a29579853926af2243c14029fb5043d4fbd81
3
+ size 116328867
create-tfidf-matrix.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import pandas as pd
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ import argparse
6
+
7
def _fit_and_dump(texts, vectorizer_path, matrix_path):
    """Fit a TF-IDF vectorizer on *texts* and pickle it with its matrix.

    Args:
        texts: Iterable of raw documents (here, one DataFrame column).
        vectorizer_path: Destination of the pickled, fitted ``TfidfVectorizer``.
        matrix_path: Destination of the pickled TF-IDF matrix built from
            *texts* with that same vectorizer.
    """
    vectorizer = TfidfVectorizer()
    # fit_transform is documented as equivalent to fit() followed by
    # transform() on the same data, but it only analyzes the corpus once.
    tfidf_matrix = vectorizer.fit_transform(texts)
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(tfidf_matrix, matrix_path)


def main():
    """Build and save TF-IDF vectorizers/matrices for a prompt/completion set.

    Reads the parquet file given by ``--input`` and writes four pickles into
    the ``--output`` directory: a fitted vectorizer and the corresponding
    TF-IDF matrix for the ``prompt`` column, and the same pair for the
    ``completion`` column.

    Raises:
        KeyError: If the input file lacks a 'prompt' or 'completion' column.
    """
    # Local import: the module-level import block does not provide ``os``.
    import os

    parser = argparse.ArgumentParser(
        description=(
            'Fit TF-IDF vectorizers on the prompt and completion columns of '
            'a parquet file and save the vectorizers and TF-IDF matrices.'
        )
    )
    parser.add_argument(
        '--input',
        type=str,
        required=True,
        help="Input file path (file should be in parquet format and have "
             "'prompt' and 'completion' columns)",
    )
    parser.add_argument(
        '--output',
        type=str,
        default='.',
        help='Output directory for the generated .pkl files '
             '(default: current directory)',
    )
    args = parser.parse_args()

    # Join with a real path separator so that "--output out" and
    # "--output out/" both write inside the "out" directory.
    out_dir = args.output or '.'
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_parquet(args.input)

    # Fail before any expensive fitting (and before any file is written)
    # when the expected columns are absent.
    missing = [col for col in ('prompt', 'completion') if col not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required column(s): {missing}")

    # NOTE(review): the prompt matrix keeps its original file name; confirm
    # that it matches the name the consuming app loads.
    _fit_and_dump(
        df['prompt'],
        os.path.join(out_dir, 'prompt-vectorizer.pkl'),
        os.path.join(out_dir, 'prompt-tfidf_matrix.pkl'),
    )

    # The app loads 'completion-tfidf-matrix.pkl' (hyphens only); writing
    # 'completion_tfidf-matrix.pkl' left the app reading a stale matrix.
    _fit_and_dump(
        df['completion'],
        os.path.join(out_dir, 'completion-vectorizer.pkl'),
        os.path.join(out_dir, 'completion-tfidf-matrix.pkl'),
    )

    print("Done!")


if __name__ == '__main__':
    main()
45
+
46
+ # example usage: python create-tfidf-matrix.py --input fine-tuning-data.parquet --output ./
fine-tuning-data.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0eb2d74b25cf773e7a2edbcdc3d05818e80eaa494d124e4deb5820158958d7c
3
- size 89839763
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb05796faee1bfad1857780ee76ce08655bbc44c8d1391325b6bddd638f9d99
3
+ size 89835929
prompt-tfidf_matrix.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95d8c1d302b36e5fef3da79e802354972158b247051715c98d55f351b8993fe2
3
+ size 37977659
prompt-vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29a72f7e1c286ffe7c87c5384484f5471baec4b3a86238b458fdbcfb52d01a38
3
  size 3324940
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:739df119b82ef1f2d8dfd4d85bc1ee489d2705b48d1bd701627df9222e15cc8f
3
  size 3324940