codelion committed
Commit 33012a5
1 Parent(s): 196f122

Update app.py

Files changed (1):
  app.py +8 -0
app.py CHANGED
@@ -7,6 +7,14 @@ import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
 
+import subprocess
+# Install flash attention, skipping CUDA build if necessary
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+
 MAX_MAX_NEW_TOKENS = 1024
 DEFAULT_MAX_NEW_TOKENS = 512
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
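
The added snippet installs flash-attn at startup. Setting FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells the flash-attn build to skip compiling the CUDA extension locally so pip can fall back to a prebuilt wheel, and --no-build-isolation lets the build reuse the torch already installed in the environment. Once installed, the package is typically enabled when the model is loaded; below is a minimal sketch, assuming app.py loads its model via AutoModelForCausalLM (the checkpoint name is a placeholder, not taken from this diff):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/your-model"  # placeholder; the actual checkpoint is defined elsewhere in app.py

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,               # flash attention requires fp16/bf16 weights
    attn_implementation="flash_attention_2",  # use the flash-attn package installed above
    device_map="auto",
)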