Spaces:

ggml-org
/

gguf-my-lora

Running on CPU Upgrade

File size: 7,495 Bytes

import os
import subprocess
import signal
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr
import tempfile

from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from pathlib import Path
from textwrap import dedent
from apscheduler.schedulers.background import BackgroundScheduler


# Hub token read from the environment (None when the variable is unset);
# restart_space() passes it to the Hub API when rebooting this Space.
HF_TOKEN = os.getenv("HF_TOKEN")

# File name of the conversion script, looked up under the llama.cpp/ directory
# by process_model().
CONVERSION_SCRIPT = "convert_lora_to_gguf.py"

def process_model(peft_model_id: str, q_method: str, private_repo, oauth_token: gr.OAuthToken | None):
    """Convert a PEFT LoRA adapter repo to GGUF and upload it to the user's namespace.

    The adapter is downloaded into a temporary directory, converted by running
    the llama.cpp conversion script in a subprocess, and uploaded (together
    with a generated README model card) to ``<username>/<model>-<q_method>-GGUF``.

    Args:
        peft_model_id: Hub repo id of the PEFT LoRA adapter, e.g. ``user/name``.
        q_method: Output type passed (lower-cased) to the script via ``--outtype``.
        private_repo: Whether the destination repo is created as private.
        oauth_token: OAuth token of the logged-in user, or None when logged out.

    Returns:
        An HTML snippet: a success message linking to the new repo, or an
        error message describing what went wrong.

    Raises:
        ValueError: If the user is not logged in (no OAuth token available).
    """
    # The annotation allows oauth_token itself to be None, so check it before
    # dereferencing `.token`; otherwise logged-out users hit an AttributeError.
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-lora")

    # An empty search box would otherwise fail later with an opaque Hub error
    # (or crash on `.split` when the value is None).
    if not peft_model_id:
        return "<h1>❌ ERROR</h1><br/><br/>Please select a PEFT LoRA repository to convert."

    model_name = peft_model_id.split('/')[-1]
    gguf_output_name = f"{model_name}-{q_method.lower()}.gguf"

    try:
        api = HfApi(token=oauth_token.token)

        # Prefer safetensors weights when the repo has any; fall back to *.bin.
        has_safetensors = any(
            file.path.endswith(".safetensors")
            for file in api.list_repo_tree(
                repo_id=peft_model_id,
                recursive=True,
            )
        )
        weights_pattern = "*.safetensors" if has_safetensors else "*.bin"
        dl_pattern = ["*.md", "*.json", "*.model", weights_pattern]

        # exist_ok avoids the check-then-create race between concurrent requests.
        os.makedirs("downloads", exist_ok=True)
        os.makedirs("outputs", exist_ok=True)

        with tempfile.TemporaryDirectory(dir="outputs") as outputdir:
            gguf_output_path = Path(outputdir)/gguf_output_name
            readme_output_path = Path(outputdir)/"README.md"

            # The download directory is only needed for the conversion step, so
            # it is cleaned up before the upload starts.
            with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
                # Keep the model name as the dirname so the model name metadata is populated correctly
                local_dir = Path(tmpdir)/model_name
                print(local_dir)
                api.snapshot_download(repo_id=peft_model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
                print("Model downloaded successfully!")
                print(f"Current working directory: {os.getcwd()}")
                print(f"Model directory contents: {os.listdir(local_dir)}")

                adapter_config_path = local_dir/"adapter_config.json"
                if not adapter_config_path.exists():
                    raise Exception('adapter_config.json not found. Please ensure the selected repo is a PEFT LoRA model.<br/><br/>If you are converting a model (not a LoRA adapter), please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-repo" target="_blank" style="text-decoration:underline">GGUF-my-repo</a> instead.')

                result = subprocess.run([
                    "python",
                    f"llama.cpp/{CONVERSION_SCRIPT}",
                    local_dir,
                    "--outtype",
                    q_method.lower(),
                    "--outfile",
                    gguf_output_path,
                ], shell=False, capture_output=True)
                print(result)
                if result.returncode != 0:
                    # stderr is captured as bytes; decode it so the user sees
                    # readable text instead of a b'...' repr.
                    stderr_text = result.stderr.decode("utf-8", errors="replace")
                    raise Exception(f"Error converting to GGUF {q_method}: {stderr_text}")
                print("Model converted to GGUF successfully!")
                print(f"Converted model path: {gguf_output_path}")

            # Create empty repo
            username = whoami(oauth_token.token)["name"]
            new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
            new_repo_id = new_repo_url.repo_id
            print("Repo created successfully!", new_repo_url)

            # Upload the GGUF model
            api.upload_file(
                path_or_fileobj=gguf_output_path,
                path_in_repo=gguf_output_name,
                repo_id=new_repo_id,
            )
            print("Uploaded", gguf_output_name)

            # Best effort: start from the source repo's model card when it can
            # be loaded, otherwise from an empty card. Only Exception is caught
            # so KeyboardInterrupt/SystemExit still propagate.
            try:
                card = ModelCard.load(peft_model_id, token=oauth_token.token)
            except Exception:
                card = ModelCard("")
            if card.data.tags is None:
                card.data.tags = []
            card.data.tags.append("llama-cpp")
            card.data.tags.append("gguf-my-lora")
            card.data.base_model = peft_model_id
            card.text = dedent(
                f"""
                # {new_repo_id}
                This LoRA adapter was converted to GGUF format from [`{peft_model_id}`](https://huggingface.co/{peft_model_id}) via the ggml.ai's [GGUF-my-lora](https://huggingface.co/spaces/ggml-org/gguf-my-lora) space.
                Refer to the [original adapter repository](https://huggingface.co/{peft_model_id}) for more details.

                ## Use with llama.cpp

                ```bash
                # with cli
                llama-cli -m base_model.gguf --lora {gguf_output_name} (...other args)

                # with server
                llama-server -m base_model.gguf --lora {gguf_output_name} (...other args)
                ```

                To know more about LoRA usage with llama.cpp server, refer to the [llama.cpp server documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md).
                """
            )
            card.save(readme_output_path)

            api.upload_file(
                path_or_fileobj=readme_output_path,
                path_in_repo="README.md",
                repo_id=new_repo_id,
            )

        return (
            f'<h1>✅ DONE</h1><br/><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>'
        )
    except Exception as e:
        # Top-level boundary: report any failure to the UI as an HTML message.
        return (f"<h1>❌ ERROR</h1><br/><br/>{e}")


# Stylesheet passed to gr.Blocks below.
css="""/* Custom CSS to allow scrolling */
.gradio-container {overflow-y: auto;}
"""
# Build the Gradio UI. Components are created inside the Blocks context in the
# order written here, so do not reorder them without checking the layout.
with gr.Blocks(css=css) as demo: 
    gr.Markdown("You must be logged in to use GGUF-my-lora.")
    gr.LoginButton(min_width=250)

    # Search box used to pick the source PEFT LoRA repository.
    peft_model_id = HuggingfaceHubSearch(
        label="PEFT LoRA repository",
        placeholder="Search for repository on Huggingface",
        search_type="model",
    )

    # Output type choices; process_model lower-cases the selected value before
    # handing it to the conversion script.
    q_method = gr.Dropdown(
        ["F32", "F16", "Q8_0"],
        label="Quantization Method",
        info="(Note: Quantization less than Q8 produces very poor results)",
        value="F16",
        filterable=False,
        visible=True
    )

    # Forwarded to process_model as `private_repo`.
    private_repo = gr.Checkbox(
        value=False,
        label="Private Repo",
        info="Create a private repo under your username."
    )

    # The three inputs map positionally to process_model's first three
    # parameters. `oauth_token` is not listed here; presumably Gradio injects it
    # based on the gr.OAuthToken annotation — confirm against Gradio's OAuth docs.
    # NOTE(review): the description says a "Public repo" is created, but the
    # "Private Repo" checkbox above can make it private — consider rewording.
    iface = gr.Interface(
        fn=process_model,
        inputs=[
            peft_model_id,
            q_method,
            private_repo,
        ],
        outputs=[
            gr.Markdown(label="output"),
        ],
        title="Convert PEFT LoRA adapters to GGUF, blazingly fast ⚡!",
        description="The space takes a PEFT LoRA (stored in a HF repo) as input, converts it to GGUF and creates a Public repo under your HF user namespace.<br/><br/>For more information, please refer to [this blog post](https://huggingface.co/blog/ngxson/gguf-my-lora)",
        api_name=False
    )


def restart_space():
    """Ask the Hub to factory-reboot the ggml-org/gguf-my-lora Space.

    Authenticates with the module-level HF_TOKEN. Returns None.
    """
    hub_client = HfApi()
    hub_client.restart_space(
        repo_id="ggml-org/gguf-my-lora",
        token=HF_TOKEN,
        factory_reboot=True,
    )

# Periodically factory-reboot the Space from a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(
    restart_space,
    "interval",
    seconds=6 * 60 * 60,  # every 6 hours (21600 seconds)
)
scheduler.start()

# Launch the interface: one job processed at a time, at most 5 requests queued.
queued_demo = demo.queue(default_concurrency_limit=1, max_size=5)
queued_demo.launch(debug=True, show_api=False)