RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype: float key.dtype: float and value.dtype: c10::Half instead.

#5
by Talha - opened

I am trying to use Falcon with LangChain. This is what I am doing:
Model:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline

model_id = 'falcon-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.bfloat16),
    device_map='auto',
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)

local_llm = HuggingFacePipeline(pipeline=pipe)
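
A quick sanity check, before wiring the pipeline into LangChain, is to call it directly (the prompt below is only an illustrative example):

# Sanity check: one generation straight through the transformers pipeline.
out = pipe("Falcon is a large language model that")
print(out[0]["generated_text"])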

Embeddings:

from langchain.embeddings import HuggingFaceEmbeddings

hfemb = HuggingFaceEmbeddings()
retriever = vector_db.as_retriever()
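
(vector_db is built earlier in the notebook; a minimal sketch of how it could be created, assuming a list of LangChain Document objects named docs, is:)

# Hypothetical vector store construction; FAISS is just one option and
# `docs` (a list of Document objects) is assumed to exist already.
from langchain.vectorstores import FAISS

vector_db = FAISS.from_documents(docs, hfemb)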

Memory:

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')
chain = ConversationalRetrievalChain.from_llm(local_llm,
                                              retriever=retriever,
                                              memory=memory,
                                              chain_type="map_reduce",
                                              return_source_documents=True)
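
The chain is then queried like this (the question string is only illustrative):

# One query through the chain; `answer` and `source_documents` are returned
# because return_source_documents=True and output_key='answer' is set.
result = chain({"question": "What does the document say about Falcon?"})
print(result["answer"])
print(result["source_documents"])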

The code worked with flan-t5 but gives this error with Falcon.

Here is the full error:

β”‚ [Errno 2] No such file or directory: '/tmp/ipykernel_67349/1702757523.py'                        β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py:140 in __call__           β”‚
β”‚                                                                                                  β”‚
β”‚   137 β”‚   β”‚   β”‚   )                                                                              β”‚
β”‚   138 β”‚   β”‚   except (KeyboardInterrupt, Exception) as e:                                        β”‚
β”‚   139 β”‚   β”‚   β”‚   run_manager.on_chain_error(e)                                                  β”‚
β”‚ ❱ 140 β”‚   β”‚   β”‚   raise e                                                                        β”‚
β”‚   141 β”‚   β”‚   run_manager.on_chain_end(outputs)                                                  β”‚
β”‚   142 β”‚   β”‚   return self.prep_outputs(inputs, outputs, return_only_outputs)                     β”‚
β”‚   143                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py:134 in __call__           β”‚
β”‚                                                                                                  β”‚
β”‚   131 β”‚   β”‚   )                                                                                  β”‚
β”‚   132 β”‚   β”‚   try:                                                                               β”‚
β”‚   133 β”‚   β”‚   β”‚   outputs = (                                                                    β”‚
β”‚ ❱ 134 β”‚   β”‚   β”‚   β”‚   self._call(inputs, run_manager=run_manager)                                β”‚
β”‚   135 β”‚   β”‚   β”‚   β”‚   if new_arg_supported                                                       β”‚
β”‚   136 β”‚   β”‚   β”‚   β”‚   else self._call(inputs)                                                    β”‚
β”‚   137 β”‚   β”‚   β”‚   )                                                                              β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/conversational_retrieval/base.py: β”‚
β”‚ 110 in _call                                                                                     β”‚
β”‚                                                                                                  β”‚
β”‚   107 β”‚   β”‚   new_inputs = inputs.copy()                                                         β”‚
β”‚   108 β”‚   β”‚   new_inputs["question"] = new_question                                              β”‚
β”‚   109 β”‚   β”‚   new_inputs["chat_history"] = chat_history_str                                      β”‚
β”‚ ❱ 110 β”‚   β”‚   answer = self.combine_docs_chain.run(                                              β”‚
β”‚   111 β”‚   β”‚   β”‚   input_documents=docs, callbacks=_run_manager.get_child(), **new_inputs         β”‚
β”‚   112 β”‚   β”‚   )                                                                                  β”‚
β”‚   113 β”‚   β”‚   if self.return_source_documents:                                                   β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py:239 in run                β”‚
β”‚                                                                                                  β”‚
β”‚   236 β”‚   β”‚   β”‚   return self(args[0], callbacks=callbacks)[self.output_keys[0]]                 β”‚
β”‚   237 β”‚   β”‚                                                                                      β”‚
β”‚   238 β”‚   β”‚   if kwargs and not args:                                                            β”‚
β”‚ ❱ 239 β”‚   β”‚   β”‚   return self(kwargs, callbacks=callbacks)[self.output_keys[0]]                  β”‚
β”‚   240 β”‚   β”‚                                                                                      β”‚
β”‚   241 β”‚   β”‚   if not kwargs and not args:                                                        β”‚
β”‚   242 β”‚   β”‚   β”‚   raise ValueError(                                                              β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py:140 in __call__           β”‚
β”‚                                                                                                  β”‚
β”‚   137 β”‚   β”‚   β”‚   )                                                                              β”‚
β”‚   138 β”‚   β”‚   except (KeyboardInterrupt, Exception) as e:                                        β”‚
β”‚   139 β”‚   β”‚   β”‚   run_manager.on_chain_error(e)                                                  β”‚
β”‚ ❱ 140 β”‚   β”‚   β”‚   raise e                                                                        β”‚
β”‚   141 β”‚   β”‚   run_manager.on_chain_end(outputs)                                                  β”‚
β”‚   142 β”‚   β”‚   return self.prep_outputs(inputs, outputs, return_only_outputs)                     β”‚
β”‚   143                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py:134 in __call__           β”‚
β”‚                                                                                                  β”‚
β”‚   131 β”‚   β”‚   )                                                                                  β”‚
β”‚   132 β”‚   β”‚   try:                                                                               β”‚
β”‚   133 β”‚   β”‚   β”‚   outputs = (                                                                    β”‚
β”‚ ❱ 134 β”‚   β”‚   β”‚   β”‚   self._call(inputs, run_manager=run_manager)                                β”‚
β”‚   135 β”‚   β”‚   β”‚   β”‚   if new_arg_supported                                                       β”‚
β”‚   136 β”‚   β”‚   β”‚   β”‚   else self._call(inputs)                                                    β”‚
β”‚   137 β”‚   β”‚   β”‚   )                                                                              β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/combine_documents/base.py:84 in   β”‚
β”‚ _call                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚    81 β”‚   β”‚   docs = inputs[self.input_key]                                                      β”‚
β”‚    82 β”‚   β”‚   # Other keys are assumed to be needed for LLM prediction                           β”‚
β”‚    83 β”‚   β”‚   other_keys = {k: v for k, v in inputs.items() if k != self.input_key}              β”‚
β”‚ ❱  84 β”‚   β”‚   output, extra_return_dict = self.combine_docs(                                     β”‚
β”‚    85 β”‚   β”‚   β”‚   docs, callbacks=_run_manager.get_child(), **other_keys                         β”‚
β”‚    86 β”‚   β”‚   )                                                                                  β”‚
β”‚    87 β”‚   β”‚   extra_return_dict[self.output_key] = output                                        β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/combine_documents/map_reduce.py:1 β”‚
β”‚ 44 in combine_docs                                                                               β”‚
β”‚                                                                                                  β”‚
β”‚   141 β”‚   β”‚   Combine by mapping first chain over all documents, then reducing the results.      β”‚
β”‚   142 β”‚   β”‚   This reducing can be done recursively if needed (if there are many documents).     β”‚
β”‚   143 β”‚   β”‚   """                                                                                β”‚
β”‚ ❱ 144 β”‚   β”‚   results = self.llm_chain.apply(                                                    β”‚
β”‚   145 β”‚   β”‚   β”‚   # FYI - this is parallelized and so it is fast.                                β”‚
β”‚   146 β”‚   β”‚   β”‚   [{self.document_variable_name: d.page_content, **kwargs} for d in docs],       β”‚
β”‚   147 β”‚   β”‚   β”‚   callbacks=callbacks,                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/llm.py:157 in apply               β”‚
β”‚                                                                                                  β”‚
β”‚   154 β”‚   β”‚   β”‚   response = self.generate(input_list, run_manager=run_manager)                  β”‚
β”‚   155 β”‚   β”‚   except (KeyboardInterrupt, Exception) as e:                                        β”‚
β”‚   156 β”‚   β”‚   β”‚   run_manager.on_chain_error(e)                                                  β”‚
β”‚ ❱ 157 β”‚   β”‚   β”‚   raise e                                                                        β”‚
β”‚   158 β”‚   β”‚   outputs = self.create_outputs(response)                                            β”‚
β”‚   159 β”‚   β”‚   run_manager.on_chain_end({"outputs": outputs})                                     β”‚
β”‚   160 β”‚   β”‚   return outputs                                                                     β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/llm.py:154 in apply               β”‚
β”‚                                                                                                  β”‚
β”‚   151 β”‚   β”‚   β”‚   {"input_list": input_list},                                                    β”‚
β”‚   152 β”‚   β”‚   )                                                                                  β”‚
β”‚   153 β”‚   β”‚   try:                                                                               β”‚
β”‚ ❱ 154 β”‚   β”‚   β”‚   response = self.generate(input_list, run_manager=run_manager)                  β”‚
β”‚   155 β”‚   β”‚   except (KeyboardInterrupt, Exception) as e:                                        β”‚
β”‚   156 β”‚   β”‚   β”‚   run_manager.on_chain_error(e)                                                  β”‚
β”‚   157 β”‚   β”‚   β”‚   raise e                                                                        β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/chains/llm.py:79 in generate             β”‚
β”‚                                                                                                  β”‚
β”‚    76 β”‚   ) -> LLMResult:                                                                        β”‚
β”‚    77 β”‚   β”‚   """Generate LLM result from inputs."""                                             β”‚
β”‚    78 β”‚   β”‚   prompts, stop = self.prep_prompts(input_list, run_manager=run_manager)             β”‚
β”‚ ❱  79 β”‚   β”‚   return self.llm.generate_prompt(                                                   β”‚
β”‚    80 β”‚   β”‚   β”‚   prompts, stop, callbacks=run_manager.get_child() if run_manager else None      β”‚
β”‚    81 β”‚   β”‚   )                                                                                  β”‚
β”‚    82                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py:134 in generate_prompt      β”‚
β”‚                                                                                                  β”‚
β”‚   131 β”‚   β”‚   callbacks: Callbacks = None,                                                       β”‚
β”‚   132 β”‚   ) -> LLMResult:                                                                        β”‚
β”‚   133 β”‚   β”‚   prompt_strings = [p.to_string() for p in prompts]                                  β”‚
β”‚ ❱ 134 β”‚   β”‚   return self.generate(prompt_strings, stop=stop, callbacks=callbacks)               β”‚
β”‚   135 β”‚                                                                                          β”‚
β”‚   136 β”‚   async def agenerate_prompt(                                                            β”‚
β”‚   137 β”‚   β”‚   self,                                                                              β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py:191 in generate             β”‚
β”‚                                                                                                  β”‚
β”‚   188 β”‚   β”‚   β”‚   β”‚   )                                                                          β”‚
β”‚   189 β”‚   β”‚   β”‚   except (KeyboardInterrupt, Exception) as e:                                    β”‚
β”‚   190 β”‚   β”‚   β”‚   β”‚   run_manager.on_llm_error(e)                                                β”‚
β”‚ ❱ 191 β”‚   β”‚   β”‚   β”‚   raise e                                                                    β”‚
β”‚   192 β”‚   β”‚   β”‚   run_manager.on_llm_end(output)                                                 β”‚
β”‚   193 β”‚   β”‚   β”‚   return output                                                                  β”‚
β”‚   194 β”‚   β”‚   if len(missing_prompts) > 0:                                                       β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py:185 in generate             β”‚
β”‚                                                                                                  β”‚
β”‚   182 β”‚   β”‚   β”‚   )                                                                              β”‚
β”‚   183 β”‚   β”‚   β”‚   try:                                                                           β”‚
β”‚   184 β”‚   β”‚   β”‚   β”‚   output = (                                                                 β”‚
β”‚ ❱ 185 β”‚   β”‚   β”‚   β”‚   β”‚   self._generate(prompts, stop=stop, run_manager=run_manager)            β”‚
β”‚   186 β”‚   β”‚   β”‚   β”‚   β”‚   if new_arg_supported                                                   β”‚
β”‚   187 β”‚   β”‚   β”‚   β”‚   β”‚   else self._generate(prompts, stop=stop)                                β”‚
β”‚   188 β”‚   β”‚   β”‚   β”‚   )                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py:436 in _generate            β”‚
β”‚                                                                                                  β”‚
β”‚   433 β”‚   β”‚   new_arg_supported = inspect.signature(self._call).parameters.get("run_manager")    β”‚
β”‚   434 β”‚   β”‚   for prompt in prompts:                                                             β”‚
β”‚   435 β”‚   β”‚   β”‚   text = (                                                                       β”‚
β”‚ ❱ 436 β”‚   β”‚   β”‚   β”‚   self._call(prompt, stop=stop, run_manager=run_manager)                     β”‚
β”‚   437 β”‚   β”‚   β”‚   β”‚   if new_arg_supported                                                       β”‚
β”‚   438 β”‚   β”‚   β”‚   β”‚   else self._call(prompt, stop=stop)                                         β”‚
β”‚   439 β”‚   β”‚   β”‚   )                                                                              β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/langchain/llms/huggingface_pipeline.py:159 in      β”‚
β”‚ _call                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚   156 β”‚   β”‚   stop: Optional[List[str]] = None,                                                  β”‚
β”‚   157 β”‚   β”‚   run_manager: Optional[CallbackManagerForLLMRun] = None,                            β”‚
β”‚   158 β”‚   ) -> str:                                                                              β”‚
β”‚ ❱ 159 β”‚   β”‚   response = self.pipeline(prompt)                                                   β”‚
β”‚   160 β”‚   β”‚   if self.pipeline.task == "text-generation":                                        β”‚
β”‚   161 β”‚   β”‚   β”‚   # Text generation return includes the starter text.                            β”‚
β”‚   162 β”‚   β”‚   β”‚   text = response[0]["generated_text"][len(prompt) :]                            β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/text_generation.py:201 in   β”‚
β”‚ __call__                                                                                         β”‚
β”‚                                                                                                  β”‚
β”‚   198 β”‚   β”‚   β”‚   - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `retu   β”‚
β”‚   199 β”‚   β”‚   β”‚     ids of the generated text.                                                   β”‚
β”‚   200 β”‚   β”‚   """                                                                                β”‚
β”‚ ❱ 201 β”‚   β”‚   return super().__call__(text_inputs, **kwargs)                                     β”‚
β”‚   202 β”‚                                                                                          β”‚
β”‚   203 β”‚   def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate   β”‚
β”‚   204 β”‚   β”‚   inputs = self.tokenizer(                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/base.py:1118 in __call__    β”‚
β”‚                                                                                                  β”‚
β”‚   1115 β”‚   β”‚   β”‚   β”‚   )                                                                         β”‚
β”‚   1116 β”‚   β”‚   β”‚   )                                                                             β”‚
β”‚   1117 β”‚   β”‚   else:                                                                             β”‚
β”‚ ❱ 1118 β”‚   β”‚   β”‚   return self.run_single(inputs, preprocess_params, forward_params, postproces  β”‚
β”‚   1119 β”‚                                                                                         β”‚
β”‚   1120 β”‚   def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):   β”‚
β”‚   1121 β”‚   β”‚   return [self.run_single(item, preprocess_params, forward_params, postprocess_par  β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/base.py:1125 in run_single  β”‚
β”‚                                                                                                  β”‚
β”‚   1122 β”‚                                                                                         β”‚
β”‚   1123 β”‚   def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):  β”‚
β”‚   1124 β”‚   β”‚   model_inputs = self.preprocess(inputs, **preprocess_params)                       β”‚
β”‚ ❱ 1125 β”‚   β”‚   model_outputs = self.forward(model_inputs, **forward_params)                      β”‚
β”‚   1126 β”‚   β”‚   outputs = self.postprocess(model_outputs, **postprocess_params)                   β”‚
β”‚   1127 β”‚   β”‚   return outputs                                                                    β”‚
β”‚   1128                                                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/base.py:1024 in forward     β”‚
β”‚                                                                                                  β”‚
β”‚   1021 β”‚   β”‚   β”‚   β”‚   inference_context = self.get_inference_context()                          β”‚
β”‚   1022 β”‚   β”‚   β”‚   β”‚   with inference_context():                                                 β”‚
β”‚   1023 β”‚   β”‚   β”‚   β”‚   β”‚   model_inputs = self._ensure_tensor_on_device(model_inputs, device=se  β”‚
β”‚ ❱ 1024 β”‚   β”‚   β”‚   β”‚   β”‚   model_outputs = self._forward(model_inputs, **forward_params)         β”‚
β”‚   1025 β”‚   β”‚   β”‚   β”‚   β”‚   model_outputs = self._ensure_tensor_on_device(model_outputs, device=  β”‚
β”‚   1026 β”‚   β”‚   β”‚   else:                                                                         β”‚
β”‚   1027 β”‚   β”‚   β”‚   β”‚   raise ValueError(f"Framework {self.framework} is not supported")          β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/text_generation.py:263 in   β”‚
β”‚ _forward                                                                                         β”‚
β”‚                                                                                                  β”‚
β”‚   260 β”‚   β”‚   β”‚   β”‚   generate_kwargs["min_length"] += prefix_length                             β”‚
β”‚   261 β”‚   β”‚                                                                                      β”‚
β”‚   262 β”‚   β”‚   # BS x SL                                                                          β”‚
β”‚ ❱ 263 β”‚   β”‚   generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=att   β”‚
β”‚   264 β”‚   β”‚   out_b = generated_sequence.shape[0]                                                β”‚
β”‚   265 β”‚   β”‚   if self.framework == "pt":                                                         β”‚
β”‚   266 β”‚   β”‚   β”‚   generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *genera   β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115 in decorate_context β”‚
β”‚                                                                                                  β”‚
β”‚   112 β”‚   @functools.wraps(func)                                                                 β”‚
β”‚   113 β”‚   def decorate_context(*args, **kwargs):                                                 β”‚
β”‚   114 β”‚   β”‚   with ctx_factory():                                                                β”‚
β”‚ ❱ 115 β”‚   β”‚   β”‚   return func(*args, **kwargs)                                                   β”‚
β”‚   116 β”‚                                                                                          β”‚
β”‚   117 β”‚   return decorate_context                                                                β”‚
β”‚   118                                                                                            β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/generation/utils.py:1518 in generate  β”‚
β”‚                                                                                                  β”‚
β”‚   1515 β”‚   β”‚   β”‚   β”‚   )                                                                         β”‚
β”‚   1516 β”‚   β”‚   β”‚                                                                                 β”‚
β”‚   1517 β”‚   β”‚   β”‚   # 11. run greedy search                                                       β”‚
β”‚ ❱ 1518 β”‚   β”‚   β”‚   return self.greedy_search(                                                    β”‚
β”‚   1519 β”‚   β”‚   β”‚   β”‚   input_ids,                                                                β”‚
β”‚   1520 β”‚   β”‚   β”‚   β”‚   logits_processor=logits_processor,                                        β”‚
β”‚   1521 β”‚   β”‚   β”‚   β”‚   stopping_criteria=stopping_criteria,                                      β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/transformers/generation/utils.py:2335 in           β”‚
β”‚ greedy_search                                                                                    β”‚
β”‚                                                                                                  β”‚
β”‚   2332 β”‚   β”‚   β”‚   model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)  β”‚
β”‚   2333 β”‚   β”‚   β”‚                                                                                 β”‚
β”‚   2334 β”‚   β”‚   β”‚   # forward pass to get next token                                              β”‚
β”‚ ❱ 2335 β”‚   β”‚   β”‚   outputs = self(                                                               β”‚
β”‚   2336 β”‚   β”‚   β”‚   β”‚   **model_inputs,                                                           β”‚
β”‚   2337 β”‚   β”‚   β”‚   β”‚   return_dict=True,                                                         β”‚
β”‚   2338 β”‚   β”‚   β”‚   β”‚   output_attentions=output_attentions,                                      β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl      β”‚
β”‚                                                                                                  β”‚
β”‚   1498 β”‚   β”‚   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks   β”‚
β”‚   1499 β”‚   β”‚   β”‚   β”‚   or _global_backward_pre_hooks or _global_backward_hooks                   β”‚
β”‚   1500 β”‚   β”‚   β”‚   β”‚   or _global_forward_hooks or _global_forward_pre_hooks):                   β”‚
β”‚ ❱ 1501 β”‚   β”‚   β”‚   return forward_call(*args, **kwargs)                                          β”‚
β”‚   1502 β”‚   β”‚   # Do not call functions when jit is used                                          β”‚
β”‚   1503 β”‚   β”‚   full_backward_hooks, non_full_backward_hooks = [], []                             β”‚
β”‚   1504 β”‚   β”‚   backward_pre_hooks = []                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py:165 in new_forward             β”‚
β”‚                                                                                                  β”‚
β”‚   162 β”‚   β”‚   β”‚   with torch.no_grad():                                                          β”‚
β”‚   163 β”‚   β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                      β”‚
β”‚   164 β”‚   β”‚   else:                                                                              β”‚
β”‚ ❱ 165 β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                          β”‚
β”‚   166 β”‚   β”‚   return module._hf_hook.post_forward(module, output)                                β”‚
β”‚   167 β”‚                                                                                          β”‚
β”‚   168 β”‚   module.forward = new_forward                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py:753 in     β”‚
β”‚ forward                                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚    750 β”‚   β”‚                                                                                     β”‚
β”‚    751 β”‚   β”‚   return_dict = return_dict if return_dict is not None else self.config.use_return  β”‚
β”‚    752 β”‚   β”‚                                                                                     β”‚
β”‚ ❱  753 β”‚   β”‚   transformer_outputs = self.transformer(                                           β”‚
β”‚    754 β”‚   β”‚   β”‚   input_ids,                                                                    β”‚
β”‚    755 β”‚   β”‚   β”‚   past_key_values=past_key_values,                                              β”‚
β”‚    756 β”‚   β”‚   β”‚   attention_mask=attention_mask,                                                β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl      β”‚
β”‚                                                                                                  β”‚
β”‚   1498 β”‚   β”‚   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks   β”‚
β”‚   1499 β”‚   β”‚   β”‚   β”‚   or _global_backward_pre_hooks or _global_backward_hooks                   β”‚
β”‚   1500 β”‚   β”‚   β”‚   β”‚   or _global_forward_hooks or _global_forward_pre_hooks):                   β”‚
β”‚ ❱ 1501 β”‚   β”‚   β”‚   return forward_call(*args, **kwargs)                                          β”‚
β”‚   1502 β”‚   β”‚   # Do not call functions when jit is used                                          β”‚
β”‚   1503 β”‚   β”‚   full_backward_hooks, non_full_backward_hooks = [], []                             β”‚
β”‚   1504 β”‚   β”‚   backward_pre_hooks = []                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py:165 in new_forward             β”‚
β”‚                                                                                                  β”‚
β”‚   162 β”‚   β”‚   β”‚   with torch.no_grad():                                                          β”‚
β”‚   163 β”‚   β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                      β”‚
β”‚   164 β”‚   β”‚   else:                                                                              β”‚
β”‚ ❱ 165 β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                          β”‚
β”‚   166 β”‚   β”‚   return module._hf_hook.post_forward(module, output)                                β”‚
β”‚   167 β”‚                                                                                          β”‚
β”‚   168 β”‚   module.forward = new_forward                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py:648 in     β”‚
β”‚ forward                                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚    645 β”‚   β”‚   β”‚   β”‚   β”‚   head_mask[i],                                                         β”‚
β”‚    646 β”‚   β”‚   β”‚   β”‚   )                                                                         β”‚
β”‚    647 β”‚   β”‚   β”‚   else:                                                                         β”‚
β”‚ ❱  648 β”‚   β”‚   β”‚   β”‚   outputs = block(                                                          β”‚
β”‚    649 β”‚   β”‚   β”‚   β”‚   β”‚   hidden_states,                                                        β”‚
β”‚    650 β”‚   β”‚   β”‚   β”‚   β”‚   layer_past=layer_past,                                                β”‚
β”‚    651 β”‚   β”‚   β”‚   β”‚   β”‚   attention_mask=causal_mask,                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl      β”‚
β”‚                                                                                                  β”‚
β”‚   1498 β”‚   β”‚   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks   β”‚
β”‚   1499 β”‚   β”‚   β”‚   β”‚   or _global_backward_pre_hooks or _global_backward_hooks                   β”‚
β”‚   1500 β”‚   β”‚   β”‚   β”‚   or _global_forward_hooks or _global_forward_pre_hooks):                   β”‚
β”‚ ❱ 1501 β”‚   β”‚   β”‚   return forward_call(*args, **kwargs)                                          β”‚
β”‚   1502 β”‚   β”‚   # Do not call functions when jit is used                                          β”‚
β”‚   1503 β”‚   β”‚   full_backward_hooks, non_full_backward_hooks = [], []                             β”‚
β”‚   1504 β”‚   β”‚   backward_pre_hooks = []                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py:165 in new_forward             β”‚
β”‚                                                                                                  β”‚
β”‚   162 β”‚   β”‚   β”‚   with torch.no_grad():                                                          β”‚
β”‚   163 β”‚   β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                      β”‚
β”‚   164 β”‚   β”‚   else:                                                                              β”‚
β”‚ ❱ 165 β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                          β”‚
β”‚   166 β”‚   β”‚   return module._hf_hook.post_forward(module, output)                                β”‚
β”‚   167 β”‚                                                                                          β”‚
β”‚   168 β”‚   module.forward = new_forward                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py:385 in     β”‚
β”‚ forward                                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚    382 β”‚   β”‚   residual = hidden_states                                                          β”‚
β”‚    383 β”‚   β”‚                                                                                     β”‚
β”‚    384 β”‚   β”‚   # Self attention.                                                                 β”‚
β”‚ ❱  385 β”‚   β”‚   attn_outputs = self.self_attention(                                               β”‚
β”‚    386 β”‚   β”‚   β”‚   layernorm_output,                                                             β”‚
β”‚    387 β”‚   β”‚   β”‚   layer_past=layer_past,                                                        β”‚
β”‚    388 β”‚   β”‚   β”‚   attention_mask=attention_mask,                                                β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in _call_impl      β”‚
β”‚                                                                                                  β”‚
β”‚   1498 β”‚   β”‚   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks   β”‚
β”‚   1499 β”‚   β”‚   β”‚   β”‚   or _global_backward_pre_hooks or _global_backward_hooks                   β”‚
β”‚   1500 β”‚   β”‚   β”‚   β”‚   or _global_forward_hooks or _global_forward_pre_hooks):                   β”‚
β”‚ ❱ 1501 β”‚   β”‚   β”‚   return forward_call(*args, **kwargs)                                          β”‚
β”‚   1502 β”‚   β”‚   # Do not call functions when jit is used                                          β”‚
β”‚   1503 β”‚   β”‚   full_backward_hooks, non_full_backward_hooks = [], []                             β”‚
β”‚   1504 β”‚   β”‚   backward_pre_hooks = []                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py:165 in new_forward             β”‚
β”‚                                                                                                  β”‚
β”‚   162 β”‚   β”‚   β”‚   with torch.no_grad():                                                          β”‚
β”‚   163 β”‚   β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                      β”‚
β”‚   164 β”‚   β”‚   else:                                                                              β”‚
β”‚ ❱ 165 β”‚   β”‚   β”‚   output = old_forward(*args, **kwargs)                                          β”‚
β”‚   166 β”‚   β”‚   return module._hf_hook.post_forward(module, output)                                β”‚
β”‚   167 β”‚                                                                                          β”‚
β”‚   168 β”‚   module.forward = new_forward                                                           β”‚
β”‚                                                                                                  β”‚
β”‚ /home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py:279 in     β”‚
β”‚ forward                                                                                          β”‚
β”‚                                                                                                  β”‚
β”‚    276 β”‚   β”‚   β”‚   key_layer_ = key_layer.reshape(batch_size, self.num_kv, -1, self.head_dim)    β”‚
β”‚    277 β”‚   β”‚   β”‚   value_layer_ = value_layer.reshape(batch_size, self.num_kv, -1, self.head_di  β”‚
β”‚    278 β”‚   β”‚   β”‚                                                                                 β”‚
β”‚ ❱  279 β”‚   β”‚   β”‚   attn_output = F.scaled_dot_product_attention(                                 β”‚
β”‚    280 β”‚   β”‚   β”‚   β”‚   query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True         β”‚
β”‚    281 β”‚   β”‚   β”‚   )                                                                             β”‚
β”‚    282                                                                                           β”‚
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype: float key.dtype: float and value.dtype: c10::Half instead.
Technology Innovation Institute org

Thanks for the report; this should be resolved by https://huggingface.co/tiiuae/falcon-7b/commit/1ba2370c784b56f8b31afc66d5234e8fb40a7209.
Note, though, that inference with dtypes other than bfloat16 has not been fully validated and may incur some degradation in model quality.
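
For example, one way to keep query, key, and value in the same dtype is to load the model with bfloat16 for both the 4-bit compute dtype and the remaining unquantized weights; a sketch, not an officially validated configuration:

# Sketch: keep every tensor path in bfloat16 so the attention inputs match.
# torch_dtype covers the weights that stay unquantized (e.g. layer norms).
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    'falcon-7b',
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    device_map='auto',
)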

Let us know if you have any more issues.

FalconLLM changed discussion status to closed
