import { HfInference } from "@huggingface/inference";
export const LLM_CONFIG = {
/* Hugging Face Inference API config: */
ollama: false,
huggingface: true,
url: "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
chatModel: "meta-llama/Meta-Llama-3-8B-Instruct",
embeddingModel:
"https://api-inference.huggingface.co/models/mixedbread-ai/mxbai-embed-large-v1",
embeddingDimension: 1024,
/* Ollama (local) config: */
// ollama: true,
// url: 'http://127.0.0.1:11434',
// chatModel: 'llama3' as const,
// embeddingModel: 'mxbai-embed-large',
// embeddingDimension: 1024,
// embeddingModel: 'llama3',
// embeddingDimension: 4096,
/* Together.ai config:
ollama: false,
url: 'https://api.together.xyz',
chatModel: 'meta-llama/Llama-3-8b-chat-hf',
embeddingModel: 'togethercomputer/m2-bert-80M-8k-retrieval',
embeddingDimension: 768,
*/
/* OpenAI config:
ollama: false,
url: 'https://api.openai.com',
chatModel: 'gpt-3.5-turbo-16k',
embeddingModel: 'text-embedding-ada-002',
embeddingDimension: 1536,
*/
};
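// Joins the configured base URL (or an env override) with a request path, e.g. with
// the Hugging Face config above and no overrides set:
//   apiUrl("/v1/chat/completions")
//   // -> "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct/v1/chat/completions"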
function apiUrl(path: string) {
// OPENAI_API_BASE and OLLAMA_HOST are legacy
const host =
process.env.LLM_API_URL ??
process.env.OLLAMA_HOST ??
process.env.OPENAI_API_BASE ??
LLM_CONFIG.url;
if (host.endsWith("/") && path.startsWith("/")) {
return host + path.slice(1);
} else if (!host.endsWith("/") && !path.startsWith("/")) {
return host + "/" + path;
} else {
return host + path;
}
}
function apiKey() {
return process.env.LLM_API_KEY ?? process.env.OPENAI_API_KEY;
}
const AuthHeaders = (): Record<string, string> =>
apiKey()
? {
Authorization: "Bearer " + apiKey(),
}
: {};
// Overload for non-streaming
export async function chatCompletion(
body: Omit<CreateChatCompletionRequest, "model"> & {
model?: CreateChatCompletionRequest["model"];
} & {
stream?: false | null | undefined;
}
): Promise<{ content: string; retries: number; ms: number }>;
// Overload for streaming
export async function chatCompletion(
body: Omit<CreateChatCompletionRequest, "model"> & {
model?: CreateChatCompletionRequest["model"];
} & {
stream?: true;
}
): Promise<{ content: ChatCompletionContent; retries: number; ms: number }>;
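/**
 * Example usage (a sketch; assumes LLM_API_KEY is set for the configured provider):
 *
 *   // Non-streaming: resolves to the full completion string.
 *   const { content } = await chatCompletion({
 *     messages: [{ role: "user", content: "Say hello." }],
 *     max_tokens: 64,
 *   });
 *
 *   // Streaming: resolves to a ChatCompletionContent that can be iterated.
 *   const { content: stream } = await chatCompletion({
 *     messages: [{ role: "user", content: "Tell me a story." }],
 *     stream: true,
 *   });
 *   let text = "";
 *   for await (const chunk of stream.read()) {
 *     text += chunk;
 *   }
 */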
export async function chatCompletion(
body: Omit<CreateChatCompletionRequest, "model"> & {
model?: CreateChatCompletionRequest["model"];
}
) {
assertApiKey();
// OLLAMA_MODEL is legacy
body.model =
body.model ??
process.env.LLM_MODEL ??
process.env.OLLAMA_MODEL ??
LLM_CONFIG.chatModel;
const stopWords = body.stop
? typeof body.stop === "string"
? [body.stop]
: body.stop
: [];
if (LLM_CONFIG.ollama || LLM_CONFIG.huggingface) stopWords.push("<|eot_id|>");
const {
result: content,
retries,
ms,
} = await retryWithBackoff(async () => {
const hf = new HfInference(apiKey());
const model = hf.endpoint(apiUrl("/v1/chat/completions"));
if (body.stream) {
const completion = model.chatCompletionStream({
...body,
});
return new ChatCompletionContent(completion, stopWords);
} else {
const completion = await model.chatCompletion({
...body,
});
const content = completion.choices[0].message?.content;
if (content === undefined) {
throw new Error(
"Unexpected result from OpenAI: " + JSON.stringify(completion)
);
}
return content;
}
});
return {
content,
retries,
ms,
};
}
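// If the error text indicates a missing local model, ask the Ollama server to pull
// it, then throw a { retry: true } error so a surrounding retryWithBackoff call can
// re-run the original request after the pull completes.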
export async function tryPullOllama(model: string, error: string) {
if (error.includes("try pulling")) {
console.error("Embedding model not found, pulling from Ollama");
const pullResp = await fetch(apiUrl("/api/pull"), {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ name: model }),
});
console.log("Pull response", await pullResp.text());
throw {
retry: true,
error: `Dynamically pulled model. Original error: ${error}`,
};
}
}
export async function fetchEmbeddingBatch(texts: string[]) {
if (LLM_CONFIG.ollama) {
return {
ollama: true as const,
embeddings: await Promise.all(
texts.map(async (t) => (await ollamaFetchEmbedding(t)).embedding)
),
};
}
assertApiKey();
if (LLM_CONFIG.huggingface) {
const result = await fetch(LLM_CONFIG.embeddingModel, {
method: "POST",
headers: {
"Content-Type": "application/json",
"X-Wait-For-Model": "true",
...AuthHeaders(),
},
body: JSON.stringify({
inputs: texts.map((text) => text.replace(/\n/g, " ")),
}),
});
if (!result.ok) {
throw new Error(
`Embedding failed with code ${result.status}: ${await result.text()}`
);
}
// The Inference API returns the raw embedding vectors, so reuse the `ollama`
// result shape (embeddings only, no usage/retry stats).
const embeddings = (await result.json()) as number[][];
return {
ollama: true as const,
embeddings,
};
}
const {
result: json,
retries,
ms,
} = await retryWithBackoff(async () => {
const result = await fetch(apiUrl("/v1/embeddings"), {
method: "POST",
headers: {
"Content-Type": "application/json",
...AuthHeaders(),
},
body: JSON.stringify({
model: LLM_CONFIG.embeddingModel,
input: texts.map((text) => text.replace(/\n/g, " ")),
}),
});
if (!result.ok) {
throw {
retry: result.status === 429 || result.status >= 500,
error: new Error(
`Embedding failed with code ${result.status}: ${await result.text()}`
),
};
}
return (await result.json()) as CreateEmbeddingResponse;
});
if (json.data.length !== texts.length) {
console.error(json);
throw new Error("Unexpected number of embeddings");
}
const allEmbeddings = json.data;
allEmbeddings.sort((a, b) => a.index - b.index);
return {
ollama: false as const,
embeddings: allEmbeddings.map(({ embedding }) => embedding),
usage: json.usage?.total_tokens,
retries,
ms,
};
}
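/**
 * Embeds a single string by delegating to fetchEmbeddingBatch. A sketch of usage:
 *
 *   const { embedding } = await fetchEmbedding("memorable moment at the park");
 *   // `embedding` is a number[] of length LLM_CONFIG.embeddingDimension (1024 here).
 */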
export async function fetchEmbedding(text: string) {
const { embeddings, ...stats } = await fetchEmbeddingBatch([text]);
return { embedding: embeddings[0], ...stats };
}
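/**
 * Calls the provider's OpenAI-style /v1/moderations endpoint. Note that the returned
 * value is the full parsed response, so callers check e.g.
 * `(await fetchModeration(text)).results[0].flagged`. This route is typically only
 * served by OpenAI-compatible backends, not the Hugging Face Inference API.
 */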
export async function fetchModeration(content: string) {
assertApiKey();
const { result: flagged } = await retryWithBackoff(async () => {
const result = await fetch(apiUrl("/v1/moderations"), {
method: "POST",
headers: {
"Content-Type": "application/json",
...AuthHeaders(),
},
body: JSON.stringify({
input: content,
}),
});
if (!result.ok) {
throw {
retry: result.status === 429 || result.status >= 500,
error: new Error(
`Moderation failed with code ${result.status}: ${await result.text()}`
),
};
}
return (await result.json()) as { results: { flagged: boolean }[] };
});
return flagged;
}
export function assertApiKey() {
if (!LLM_CONFIG.ollama && !apiKey()) {
throw new Error(
"\n Missing LLM_API_KEY in environment variables.\n\n" +
"npx convex env set LLM_API_KEY 'your-key'"
);
}
}
// Retry after this much time, based on the retry number.
const RETRY_BACKOFF = [1000, 10_000, 20_000]; // In ms
const RETRY_JITTER = 100; // In ms
type RetryError = { retry: boolean; error: any };
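/**
 * Runs `fn`, retrying up to RETRY_BACKOFF.length times (with jitter) whenever it
 * throws a value of the shape { retry: true, error }. Any other throw, or
 * { retry: false, error }, is surfaced immediately. A sketch of a retryable call
 * (someUrl is a placeholder):
 *
 *   const { result, retries, ms } = await retryWithBackoff(async () => {
 *     const resp = await fetch(someUrl);
 *     if (!resp.ok) {
 *       // Retry on rate limits and server errors, fail fast otherwise.
 *       throw {
 *         retry: resp.status === 429 || resp.status >= 500,
 *         error: new Error(await resp.text()),
 *       };
 *     }
 *     return resp.json();
 *   });
 */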
export async function retryWithBackoff<T>(
fn: () => Promise<T>
): Promise<{ retries: number; result: T; ms: number }> {
let i = 0;
for (; i <= RETRY_BACKOFF.length; i++) {
try {
const start = Date.now();
const result = await fn();
const ms = Date.now() - start;
return { result, retries: i, ms };
} catch (e) {
const retryError = e as RetryError;
if (i < RETRY_BACKOFF.length) {
if (retryError.retry) {
console.log(
`Attempt ${i + 1} failed, waiting ${
RETRY_BACKOFF[i]
}ms to retry...`,
Date.now()
);
await new Promise((resolve) =>
setTimeout(resolve, RETRY_BACKOFF[i] + RETRY_JITTER * Math.random())
);
continue;
}
}
if (retryError.error) throw retryError.error;
else throw e;
}
}
throw new Error("Unreachable");
}
// Lifted from openai's package
export interface LLMMessage {
/**
* The contents of the message. `content` is required for all messages, and may be
* null for assistant messages with function calls.
*/
content: string | null;
/**
* The role of the messages author. One of `system`, `user`, `assistant`, or
* `function`.
*/
role: "system" | "user" | "assistant" | "function";
/**
* The name of the author of this message. `name` is required if role is
* `function`, and it should be the name of the function whose response is in the
* `content`. May contain a-z, A-Z, 0-9, and underscores, with a maximum length of
* 64 characters.
*/
name?: string;
/**
* The name and arguments of a function that should be called, as generated by the model.
*/
function_call?: {
// The name of the function to call.
name: string;
/**
* The arguments to call the function with, as generated by the model in
* JSON format. Note that the model does not always generate valid JSON,
* and may hallucinate parameters not defined by your function schema.
* Validate the arguments in your code before calling your function.
*/
arguments: string;
};
}
// Non-streaming chat completion response
interface CreateChatCompletionResponse {
id: string;
object: string;
created: number;
model: string;
choices: {
index?: number;
message?: {
role: "system" | "user" | "assistant";
content: string;
};
finish_reason?: string;
}[];
usage?: {
completion_tokens: number;
prompt_tokens: number;
total_tokens: number;
};
}
interface CreateEmbeddingResponse {
data: {
index: number;
object: string;
embedding: number[];
}[];
model: string;
object: string;
usage: {
prompt_tokens: number;
total_tokens: number;
};
}
export interface CreateChatCompletionRequest {
/**
* ID of the model to use.
* @type {string}
* @memberof CreateChatCompletionRequest
*/
model: string;
// | 'gpt-4'
// | 'gpt-4-0613'
// | 'gpt-4-32k'
// | 'gpt-4-32k-0613'
// | 'gpt-3.5-turbo'
// | 'gpt-3.5-turbo-0613'
// | 'gpt-3.5-turbo-16k' // <- our default
// | 'gpt-3.5-turbo-16k-0613';
/**
* The messages to generate chat completions for, in the chat format:
* https://platform.openai.com/docs/guides/chat/introduction
* @type {Array<ChatCompletionRequestMessage>}
* @memberof CreateChatCompletionRequest
*/
messages: LLMMessage[];
/**
* What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.
* @type {number}
* @memberof CreateChatCompletionRequest
*/
temperature?: number | null;
/**
* An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. We generally recommend altering this or `temperature` but not both.
* @type {number}
* @memberof CreateChatCompletionRequest
*/
top_p?: number | null;
/**
* How many chat completion choices to generate for each input message.
* @type {number}
* @memberof CreateChatCompletionRequest
*/
n?: number | null;
/**
* If set, partial message deltas will be sent, like in ChatGPT. Tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message.
* @type {boolean}
* @memberof CreateChatCompletionRequest
*/
stream?: boolean | null;
/**
*
* @type {CreateChatCompletionRequestStop}
* @memberof CreateChatCompletionRequest
*/
stop?: Array<string> | string;
/**
* The maximum number of tokens allowed for the generated answer. By default,
* the number of tokens the model can return will be (4096 - prompt tokens).
* @type {number}
* @memberof CreateChatCompletionRequest
*/
max_tokens?: number;
/**
* Number between -2.0 and 2.0. Positive values penalize new tokens based on
* whether they appear in the text so far, increasing the model's likelihood
* to talk about new topics. See more information about frequency and
* presence penalties:
* https://platform.openai.com/docs/api-reference/parameter-details
* @type {number}
* @memberof CreateChatCompletionRequest
*/
presence_penalty?: number | null;
/**
* Number between -2.0 and 2.0. Positive values penalize new tokens based on
* their existing frequency in the text so far, decreasing the model's
* likelihood to repeat the same line verbatim. See more information about
* frequency and presence penalties:
* https://platform.openai.com/docs/api-reference/parameter-details
* @type {number}
* @memberof CreateChatCompletionRequest
*/
frequency_penalty?: number | null;
/**
* Modify the likelihood of specified tokens appearing in the completion.
* Accepts a json object that maps tokens (specified by their token ID in the
* tokenizer) to an associated bias value from -100 to 100. Mathematically,
* the bias is added to the logits generated by the model prior to sampling.
* The exact effect will vary per model, but values between -1 and 1 should
* decrease or increase likelihood of selection; values like -100 or 100
* should result in a ban or exclusive selection of the relevant token.
* @type {object}
* @memberof CreateChatCompletionRequest
*/
logit_bias?: object | null;
/**
* A unique identifier representing your end-user, which can help OpenAI to
* monitor and detect abuse. Learn more:
* https://platform.openai.com/docs/guides/safety-best-practices/end-user-ids
* @type {string}
* @memberof CreateChatCompletionRequest
*/
user?: string;
tools?: {
// The type of the tool. Currently, only function is supported.
type: "function";
function: {
/**
* The name of the function to be called. Must be a-z, A-Z, 0-9, or
* contain underscores and dashes, with a maximum length of 64.
*/
name: string;
/**
* A description of what the function does, used by the model to choose
* when and how to call the function.
*/
description?: string;
/**
* The parameters the functions accepts, described as a JSON Schema
* object. See the guide[1] for examples, and the JSON Schema reference[2]
* for documentation about the format.
* [1]: https://platform.openai.com/docs/guides/gpt/function-calling
* [2]: https://json-schema.org/understanding-json-schema/
* To describe a function that accepts no parameters, provide the value
* {"type": "object", "properties": {}}.
*/
parameters: object;
};
}[];
/**
* Controls which (if any) function is called by the model. `none` means the
* model will not call a function and instead generates a message.
* `auto` means the model can pick between generating a message or calling a
* function. Specifying a particular function via
* {"type: "function", "function": {"name": "my_function"}} forces the model
* to call that function.
*
* `none` is the default when no functions are present.
* `auto` is the default if functions are present.
*/
tool_choice?:
| "none" // none means the model will not call a function and instead generates a message.
| "auto" // auto means the model can pick between generating a message or calling a function.
// Specifies a tool the model should use. Use to force the model to call
// a specific function.
| {
// The type of the tool. Currently, only function is supported.
type: "function";
function: { name: string };
};
// Replaced by "tools"
// functions?: {
// /**
// * The name of the function to be called. Must be a-z, A-Z, 0-9, or
// * contain underscores and dashes, with a maximum length of 64.
// */
// name: string;
// /**
// * A description of what the function does, used by the model to choose
// * when and how to call the function.
// */
// description?: string;
// /**
// * The parameters the functions accepts, described as a JSON Schema
// * object. See the guide[1] for examples, and the JSON Schema reference[2]
// * for documentation about the format.
// * [1]: https://platform.openai.com/docs/guides/gpt/function-calling
// * [2]: https://json-schema.org/understanding-json-schema/
// * To describe a function that accepts no parameters, provide the value
// * {"type": "object", "properties": {}}.
// */
// parameters: object;
// }[];
// /**
// * Controls how the model responds to function calls. "none" means the model
// * does not call a function, and responds to the end-user. "auto" means the
// * model can pick between an end-user or calling a function. Specifying a
// * particular function via {"name": "my_function"} forces the model to call
// * that function.
// * - "none" is the default when no functions are present.
// * - "auto" is the default if functions are present.
// */
// function_call?: 'none' | 'auto' | { name: string };
/**
* An object specifying the format that the model must output.
*
* Setting to { "type": "json_object" } enables JSON mode, which guarantees
* the message the model generates is valid JSON.
* *Important*: when using JSON mode, you must also instruct the model to
* produce JSON yourself via a system or user message. Without this, the model
* may generate an unending stream of whitespace until the generation reaches
* the token limit, resulting in a long-running and seemingly "stuck" request.
* Also note that the message content may be partially cut off if
* finish_reason="length", which indicates the generation exceeded max_tokens
* or the conversation exceeded the max context length.
*/
response_format?: { type: "text" | "json_object" };
}
// Checks whether a suffix of s1 is a prefix of s2. For example,
// ('Hello', 'Kira:') -> false
// ('Hello Kira', 'Kira:') -> true
const suffixOverlapsPrefix = (s1: string, s2: string) => {
for (let i = 1; i <= Math.min(s1.length, s2.length); i++) {
const suffix = s1.substring(s1.length - i);
const prefix = s2.substring(0, i);
if (suffix === prefix) {
return true;
}
}
return false;
};
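// Minimal shape of a streamed, delta-style chat completion chunk, as consumed by
// ChatCompletionContent below. This is assumed to match the OpenAI-compatible
// stream produced by chatCompletionStream; only the fields read here are declared.
interface ChatCompletionChunk {
  choices: {
    index?: number;
    delta: {
      role?: string;
      content?: string | null;
    };
    finish_reason?: string | null;
  }[];
}

/**
 * Wraps a streamed completion and truncates output at the first configured stop
 * word as chunks arrive. Consume it with `for await (const chunk of content.read())`
 * or collect everything at once with `await content.readAll()`.
 */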
export class ChatCompletionContent {
private readonly completion: AsyncIterable<ChatCompletionChunk>;
private readonly stopWords: string[];
constructor(
completion: AsyncIterable<ChatCompletionChunk>,
stopWords: string[]
) {
this.completion = completion;
this.stopWords = stopWords;
}
async *readInner() {
for await (const chunk of this.completion) {
yield chunk.choices[0].delta.content ?? "";
}
}
// Stop words in the OpenAI-compatible API don't always work,
// so we have to truncate on our side.
async *read() {
let lastFragment = "";
for await (const data of this.readInner()) {
lastFragment += data;
let hasOverlap = false;
for (const stopWord of this.stopWords) {
const idx = lastFragment.indexOf(stopWord);
if (idx >= 0) {
yield lastFragment.substring(0, idx);
return;
}
if (suffixOverlapsPrefix(lastFragment, stopWord)) {
hasOverlap = true;
}
}
if (hasOverlap) continue;
yield lastFragment;
lastFragment = "";
}
yield lastFragment;
}
async readAll() {
let allContent = "";
for await (const chunk of this.read()) {
allContent += chunk;
}
return allContent;
}
}
export async function ollamaFetchEmbedding(text: string) {
const { result } = await retryWithBackoff(async () => {
const resp = await fetch(apiUrl("/api/embeddings"), {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ model: LLM_CONFIG.embeddingModel, prompt: text }),
});
if (resp.status === 404) {
const error = await resp.text();
await tryPullOllama(LLM_CONFIG.embeddingModel, error);
throw new Error(`Failed to fetch embeddings: ${resp.status}`);
}
return (await resp.json()).embedding as number[];
});
return { embedding: result };
}