RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype: float key.dtype: float and value.dtype: c10::Half instead.
#5 opened by Talha
I am trying to use Falcon with LangChain. This is what I am doing:
# model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

model_id = 'falcon-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    device_map='auto',
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)
local_llm = HuggingFacePipeline(pipeline=pipe)

# embeddings
hfemb = HuggingFaceEmbeddings()
retriever = vector_db.as_retriever()  # vector_db is a previously built vector store (not shown)

# memory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')

chain = ConversationalRetrievalChain.from_llm(
    local_llm,
    retriever=retriever,
    memory=memory,
    chain_type="map_reduce",
    return_source_documents=True,
)
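For context, the chain is then queried roughly as follows (the actual invocation is not part of this post; the question text is a hypothetical placeholder):

result = chain({"question": "What does the document say about X?"})
print(result["answer"])
print(result["source_documents"])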
The code worked with flan-t5 but gives this error with Falcon. Here is the full error:
Traceback (most recent call last):
  (source of the triggering notebook cell unavailable: [Errno 2] No such file or directory: '/tmp/ipykernel_67349/1702757523.py')
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py", line 140, in __call__
    raise e
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py", line 134, in __call__
    self._call(inputs, run_manager=run_manager)
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/conversational_retrieval/base.py", line 110, in _call
    answer = self.combine_docs_chain.run(
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py", line 239, in run
    return self(kwargs, callbacks=callbacks)[self.output_keys[0]]
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py", line 140, in __call__
    raise e
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/base.py", line 134, in __call__
    self._call(inputs, run_manager=run_manager)
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/combine_documents/base.py", line 84, in _call
    output, extra_return_dict = self.combine_docs(
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/combine_documents/map_reduce.py", line 144, in combine_docs
    results = self.llm_chain.apply(
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/llm.py", line 157, in apply
    raise e
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/llm.py", line 154, in apply
    response = self.generate(input_list, run_manager=run_manager)
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/chains/llm.py", line 79, in generate
    return self.llm.generate_prompt(
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py", line 134, in generate_prompt
    return self.generate(prompt_strings, stop=stop, callbacks=callbacks)
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py", line 191, in generate
    raise e
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py", line 185, in generate
    self._generate(prompts, stop=stop, run_manager=run_manager)
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/llms/base.py", line 436, in _generate
    self._call(prompt, stop=stop, run_manager=run_manager)
  File "/home/talha/venv/lib/python3.10/site-packages/langchain/llms/huggingface_pipeline.py", line 159, in _call
    response = self.pipeline(prompt)
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/text_generation.py", line 201, in __call__
    return super().__call__(text_inputs, **kwargs)
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1118, in __call__
    return self.run_single(inputs, preprocess_params, forward_params, postproces...
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1125, in run_single
    model_outputs = self.forward(model_inputs, **forward_params)
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1024, in forward
    model_outputs = self._forward(model_inputs, **forward_params)
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/pipelines/text_generation.py", line 263, in _forward
    generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=att...
  File "/home/talha/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/generation/utils.py", line 1518, in generate
    return self.greedy_search(
  File "/home/talha/venv/lib/python3.10/site-packages/transformers/generation/utils.py", line 2335, in greedy_search
    outputs = self(
  File "/home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py", line 753, in forward
    transformer_outputs = self.transformer(
  File "/home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py", line 648, in forward
    outputs = block(
  File "/home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py", line 385, in forward
    attn_outputs = self.self_attention(
  File "/home/talha/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/talha/venv/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/talha/.cache/huggingface/modules/transformers_modules/falcon-7b/modelling_RW.py", line 279, in forward
    attn_output = F.scaled_dot_product_attention(
        query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
    )
RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype: float key.dtype: float and value.dtype: c10::Half instead.
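For anyone hitting the same thing: the failure comes from torch.nn.functional.scaled_dot_product_attention, which requires query, key, and value to share one dtype. A minimal standalone sketch of the same error (illustrative only, not taken from the model code):

import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 64, dtype=torch.float32)
k = torch.randn(1, 8, 16, 64, dtype=torch.float32)
v = torch.randn(1, 8, 16, 64, dtype=torch.float16)  # mismatched dtype, mirrors the float/half mix in the 4-bit run

try:
    F.scaled_dot_product_attention(q, k, v, None, 0.0, is_causal=True)
except RuntimeError as e:
    print(e)  # "Expected query, key, and value to have the same dtype ..."

# casting everything to a single dtype makes the call succeed
out = F.scaled_dot_product_attention(q, k, v.to(q.dtype), None, 0.0, is_causal=True)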
Thanks for the report, should be resolved with: https://huggingface.co/tiiuae/falcon-7b/commit/1ba2370c784b56f8b31afc66d5234e8fb40a7209.
Though note that inference with datatypes other than bfloat16 has not been fully validated and may incur some model degradation.
Let us know if you have any more issues.
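Until the updated modelling code is picked up, one thing that is often enough is keeping every non-quantized tensor in bfloat16 so query, key, and value end up in the same dtype. A sketch based on the snippet above (torch_dtype is the only addition; whether it suffices depends on the checkpoint revision you have cached):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    'falcon-7b',
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # assumption: avoids the float32/half mix inside attention
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    device_map='auto',
)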
FalconLLM changed discussion status to closed.