Update README.md
Browse files
README.md
CHANGED
@@ -27,6 +27,11 @@ It achieves the following results on the evaluation set:
|
|
27 |
|
28 |
## Sample Code
|
29 |
|
|
|
|
|
|
|
|
|
|
|
30 |
### Test Dataset
|
31 |
If you prefer, you can use the test dataset from [zelalt/scientific-papers](https://huggingface.co/datasets/zelalt/scientific-papers)
|
32 |
or [zelalt/arxiv-papers](https://huggingface.co/datasets/zelalt/arxiv-papers), or read your PDF as text with PyPDF2.PdfReader and then give this text to the LLM, adding the "What is the title of this paper?" prompt.
|
@@ -57,16 +62,16 @@ model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, tru
|
|
57 |
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,trust_remote_code=True)
|
58 |
model = PeftModel.from_pretrained(model, peft_model_id)
|
59 |
|
60 |
-
#Put
|
61 |
-
inputs = tokenizer(f'''
|
62 |
outputs = model.generate(**inputs,max_new_tokens=50, pad_token_id = tokenizer.eos_token_id, eos_token_id = tokenizer.eos_token_id)
|
63 |
text = tokenizer.batch_decode(outputs)[0]
|
64 |
print(text)
|
65 |
```
|
66 |
|
67 |
```python
|
68 |
-
#Put
|
69 |
-
inputs = tokenizer(f'''
|
70 |
outputs = model.generate(**inputs,max_new_tokens=50, pad_token_id = tokenizer.eos_token_id, eos_token_id = tokenizer.eos_token_id)
|
71 |
text = tokenizer.batch_decode(outputs)[0]
|
72 |
print(text)
|
|
|
27 |
|
28 |
## Sample Code
|
29 |
|
30 |
+
### Requirements
|
31 |
+
```python
|
32 |
+
!pip install accelerate transformers einops datasets peft bitsandbytes
|
33 |
+
```
|
34 |
+
|
35 |
### Test Dataset
|
36 |
If you prefer, you can use the test dataset from [zelalt/scientific-papers](https://huggingface.co/datasets/zelalt/scientific-papers)
|
37 |
or [zelalt/arxiv-papers](https://huggingface.co/datasets/zelalt/arxiv-papers), or read your PDF as text with PyPDF2.PdfReader and then give this text to the LLM, adding the "What is the title of this paper?" prompt.
|
|
|
62 |
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,trust_remote_code=True)
|
63 |
model = PeftModel.from_pretrained(model, peft_model_id)
|
64 |
|
65 |
+
#Put from dataset
|
66 |
+
inputs = tokenizer(f'''{formatted_dataset['text'][120]}''', return_tensors="pt", return_attention_mask=False)
|
67 |
outputs = model.generate(**inputs,max_new_tokens=50, pad_token_id = tokenizer.eos_token_id, eos_token_id = tokenizer.eos_token_id)
|
68 |
text = tokenizer.batch_decode(outputs)[0]
|
69 |
print(text)
|
70 |
```
|
71 |
|
72 |
```python
|
73 |
+
#Put as string
|
74 |
+
inputs = tokenizer(f'''What is the title of this paper? ...[your pdf as text]..\n\nAnswer: ''', return_tensors="pt", return_attention_mask=False)
|
75 |
outputs = model.generate(**inputs,max_new_tokens=50, pad_token_id = tokenizer.eos_token_id, eos_token_id = tokenizer.eos_token_id)
|
76 |
text = tokenizer.batch_decode(outputs)[0]
|
77 |
print(text)
|