"""Flask adapter in front of Hugging Face's Serverless Inference chat API.

The adapter forwards OpenAI-style chat-completion requests to
``api-inference.huggingface.co`` while disabling the upstream cache,
choosing a large ``max_tokens`` when the caller gave none, and picking one
API token at random when several are supplied separated by semicolons.
"""
import random

import requests
from flask import (
    Flask,
    Response,
    jsonify,
    render_template_string,
    request,
    stream_with_context,
)

app = Flask(__name__)

# Deliberately oversized ``max_tokens`` sent on the probe request; the probe's
# response text is then parsed for the context limit and input token count.
_PROBE_MAX_TOKENS = 2**32 - 1


@app.route('/', methods=['GET'])
def index():
    """Render the static usage page for the adapter."""
    template = ''' Huggingface Chat API Adapter

Huggingface Chat API Adapter

[Introduction]
When using Huggingface's Serverless Inference API for a conversation, by default 100 new tokens are output and a cache is used.
This API changes these two default settings, and other parameters are consistent with the official API.

[How to use]
1. Create a token with the "Make calls to the serverless Inference API" permission as an API key.
2. Set the Base URL of the OpenAI compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".
3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407)

[Supported models]
Most of the available models can be found HERE.
Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct), please test it yourself.
Some models require a token created by a PRO user to use.

[Avoid reaching the call limit]
If you have multiple tokens, you can connect them with a semicolon (";") and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...")
'''
    return render_template_string(template)


def _error(message, status):
    """Build a JSON error body paired with the given HTTP status code."""
    return jsonify({'error': message}), status


def _pick_api_key(auth_header):
    """Pick one token at random from an ``Authorization`` header value.

    The value is expected to look like ``"<scheme> tok1;tok2;..."``. Empty
    segments (e.g. a trailing semicolon) are ignored so an empty token is
    never selected.

    Returns:
        The chosen token, or ``None`` when the header is missing, has no
        credentials part, or contains no non-empty token.
    """
    parts = (auth_header or '').split(' ', 1)
    if len(parts) != 2:
        return None
    keys = [key.strip() for key in parts[1].split(';') if key.strip()]
    return random.choice(keys) if keys else None


def _parse_token_budget(info):
    """Compute the largest usable ``max_tokens`` from the probe response text.

    Reads the number following ``"<= "`` (context limit) and the number
    following ``"Given: "`` (input token count) and returns
    ``limit - inputs - 1``.

    Raises:
        IndexError: If either marker is absent from ``info``.
        ValueError: If the text after a marker is not an integer.
    """
    max_ctx = int(info.split("<= ")[1].split(".")[0])
    inputs = int(info.split("Given: ")[1].split("`")[0])
    return max_ctx - inputs - 1


@app.route('/api/v1/chat/completions', methods=['POST'])
def proxy():
    """Forward a chat-completion request upstream and stream the reply back.

    Request headers are forwarded except ``Host`` and ``Content-Length``;
    ``Authorization`` is rewritten to use one randomly chosen token and
    ``X-Use-Cache`` is forced to ``false``. When the body has no
    ``max_tokens`` a probe request is made to derive one, and when it has no
    ``seed`` a random one is added.

    Returns:
        A streamed response carrying the upstream status code and content
        type, or a JSON error with status 401 (bad ``Authorization``),
        400 (bad body) or 502 (upstream unreachable).
    """
    headers = dict(request.headers)
    headers.pop('Host', None)
    headers.pop('Content-Length', None)

    api_key = _pick_api_key(request.headers.get('Authorization'))
    if api_key is None:
        return _error(
            'Missing or malformed Authorization header; '
            'expected "Bearer <token>[;<token>...]".',
            401,
        )
    headers['Authorization'] = f'Bearer {api_key}'
    headers['X-Use-Cache'] = 'false'

    json_data = request.get_json()
    if not isinstance(json_data, dict) or not json_data.get('model'):
        return _error(
            'Request body must be a JSON object with a "model" field.', 400
        )
    model = json_data['model']
    chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"

    # Try to use the largest ctx
    if 'max_tokens' not in json_data:
        json_data['max_tokens'] = _PROBE_MAX_TOKENS
        json_data['json_mode'] = True
        try:
            info = requests.post(
                chat_api, json=json_data, headers=headers, stream=False
            ).text
            json_data['max_tokens'] = _parse_token_budget(info)
        except (requests.RequestException, IndexError, ValueError) as exc:
            # Best effort: on failure the probe value is left in place and
            # the real request below is still attempted.
            app.logger.warning('max_tokens probe failed: %s', exc)
        finally:
            json_data['json_mode'] = False

    if 'seed' not in json_data:
        json_data['seed'] = random.randint(1, 2**32)

    # Open the upstream request before building the response so its status
    # code and content type can be passed through to the client.
    try:
        upstream = requests.post(
            chat_api, json=json_data, headers=headers, stream=True
        )
    except requests.RequestException as exc:
        return _error(f'Upstream request failed: {exc}', 502)

    def generate():
        try:
            for chunk in upstream.iter_content(chunk_size=1024):
                if chunk:
                    yield chunk
        finally:
            upstream.close()

    response = Response(
        stream_with_context(generate()),
        status=upstream.status_code,
        content_type=upstream.headers.get('Content-Type', 'text/event-stream'),
    )
    # Also release the connection if the body is never iterated (a generator
    # that was never started does not run its ``finally`` block on close).
    response.call_on_close(upstream.close)
    return response


#import gevent.pywsgi
#from gevent import monkey;monkey.patch_all()
if __name__ == "__main__":
    # NOTE(review): debug=True turns on Flask's debug mode (interactive
    # debugger and reloader); do not expose this entry point publicly.
    app.run(debug=True)
    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()