hf-llm-api

Running

Hansimov committed on Jan 28

Commit

e2b245b

•

1 Parent(s): 395e196

:gem: [Feature] Add use_cache parameter, and set default temperature to 0.5

Files changed (2) hide show

apis/chat_api.py CHANGED Viewed

@@ -88,13 +88,17 @@ class ChatAPIApp:
             description="(list) Messages",
         )
         temperature: Union[float, None] = Field(
-            default=0,
             description="(float) Temperature",
         )
         max_tokens: Union[int, None] = Field(
             default=-1,
             description="(int) Max tokens",
         )
         stream: bool = Field(
             default=True,
             description="(bool) Stream",
@@ -113,6 +117,7 @@ class ChatAPIApp:
             temperature=item.temperature,
             max_new_tokens=item.max_tokens,
             api_key=api_key,
         )
         if item.stream:
             event_source_response = EventSourceResponse(

             description="(list) Messages",
         )
         temperature: Union[float, None] = Field(
+            default=0.5,
             description="(float) Temperature",
         )
         max_tokens: Union[int, None] = Field(
             default=-1,
             description="(int) Max tokens",
         )
+        use_cache: bool = Field(
+            default=False,
+            description="(bool) Use cache",
+        )
         stream: bool = Field(
             default=True,
             description="(bool) Stream",
             temperature=item.temperature,
             max_new_tokens=item.max_tokens,
             api_key=api_key,
+            use_cache=item.use_cache,
         )
         if item.stream:
             event_source_response = EventSourceResponse(

networks/message_streamer.py CHANGED Viewed

@@ -61,9 +61,10 @@ class MessageStreamer:
     def chat_response(
         self,
         prompt: str = None,
-        temperature: float = 0,
         max_new_tokens: int = None,
         api_key: str = None,
     ):
         # https://huggingface.co/docs/api-inference/detailed_parameters?code=curl
         # curl --proxy http://<server>:<port> https://api-inference.huggingface.co/models/<org>/<model_name> -X POST -d '{"inputs":"who are you?","parameters":{"max_new_token":64}}' -H 'Content-Type: application/json' -H 'Authorization: Bearer <HF_TOKEN>'
@@ -105,6 +106,7 @@ class MessageStreamer:
         #   huggingface_hub/inference/_text_generation.py:
         #     class TextGenerationRequest > param `stream`
         # https://huggingface.co/docs/text-generation-inference/conceptual/streaming#streaming-with-curl
         self.request_body = {
             "inputs": prompt,
             "parameters": {
@@ -112,6 +114,9 @@ class MessageStreamer:
                 "max_new_tokens": max_new_tokens,
                 "return_full_text": False,
             },
             "stream": True,
         }

     def chat_response(
         self,
         prompt: str = None,
+        temperature: float = 0.5,
         max_new_tokens: int = None,
         api_key: str = None,
+        use_cache: bool = False,
     ):
         # https://huggingface.co/docs/api-inference/detailed_parameters?code=curl
         # curl --proxy http://<server>:<port> https://api-inference.huggingface.co/models/<org>/<model_name> -X POST -d '{"inputs":"who are you?","parameters":{"max_new_token":64}}' -H 'Content-Type: application/json' -H 'Authorization: Bearer <HF_TOKEN>'
         #   huggingface_hub/inference/_text_generation.py:
         #     class TextGenerationRequest > param `stream`
         # https://huggingface.co/docs/text-generation-inference/conceptual/streaming#streaming-with-curl
+        # https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task
         self.request_body = {
             "inputs": prompt,
             "parameters": {
                 "max_new_tokens": max_new_tokens,
                 "return_full_text": False,
             },
+            "options": {
+                "use_cache": use_cache,
+            },
             "stream": True,
         }