import os
import subprocess
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr
import tempfile

from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from pathlib import Path
from textwrap import dedent
from apscheduler.schedulers.background import BackgroundScheduler


HF_TOKEN = os.environ.get("HF_TOKEN")
CONVERSION_SCRIPT = "convert_lora_to_gguf.py"
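
# Note: HF_TOKEN is only used by the scheduled Space restart at the bottom of this file;
# conversions run with the logged-in user's OAuth token. The conversion script is resolved
# as llama.cpp/convert_lora_to_gguf.py relative to the working directory.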

def process_model(peft_model_id: str, q_method: str, private_repo, oauth_token: gr.OAuthToken | None):
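    """Convert a PEFT LoRA adapter repo to GGUF and upload it to a new Hub repo.

    Returns an HTML snippet linking to the created repo, or an error message on failure.
    """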
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-lora")
    model_name = peft_model_id.split('/')[-1]
    gguf_output_name = f"{model_name}-{q_method.lower()}.gguf"

    try:
        api = HfApi(token=oauth_token.token)

        dl_pattern = ["*.md", "*.json", "*.model"]

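        # Download only the weight format the repo actually contains: prefer
        # .safetensors when present, otherwise fall back to .bin files.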
        pattern = (
            "*.safetensors"
            if any(
                file.path.endswith(".safetensors")
                for file in api.list_repo_tree(
                    repo_id=peft_model_id,
                    recursive=True,
                )
            )
            else "*.bin"
        )

        dl_pattern += [pattern]

        os.makedirs("downloads", exist_ok=True)
        os.makedirs("outputs", exist_ok=True)

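        # Stage the GGUF file and README in a temporary directory so they are
        # cleaned up automatically once the upload finishes.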
        with tempfile.TemporaryDirectory(dir="outputs") as outputdir:
            gguf_output_path = Path(outputdir)/gguf_output_name
            readme_output_path = Path(outputdir)/"README.md"

            with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
                # Keep the model name as the dirname so the model name metadata is populated correctly
                local_dir = Path(tmpdir)/model_name
                print(local_dir)
                api.snapshot_download(repo_id=peft_model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
                print("Model downloaded successfully!")
                print(f"Current working directory: {os.getcwd()}")
                print(f"Model directory contents: {os.listdir(local_dir)}")

                adapter_config_dir = local_dir/"adapter_config.json"
                if not os.path.exists(adapter_config_dir):
                    raise Exception('adapter_config.json not found. Please ensure the selected repo is a PEFT LoRA model.<br/><br/>If you are converting a model (not a LoRA adapter), please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-repo" target="_blank" style="text-decoration:underline">GGUF-my-repo</a> instead.')

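                # Run llama.cpp's LoRA conversion script on the downloaded adapter,
                # writing the GGUF file at the requested output precision.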
                result = subprocess.run([
                    "python",
                    f"llama.cpp/{CONVERSION_SCRIPT}",
                    local_dir,
                    "--outtype",
                    q_method.lower(),
                    "--outfile",
                    gguf_output_path,
                ], shell=False, capture_output=True, text=True)
                print(result)
                if result.returncode != 0:
                    raise Exception(f"Error converting to GGUF {q_method}: {result.stderr}")
                print("Model converted to GGUF successfully!")
                print(f"Converted model path: {gguf_output_path}")

            # Create empty repo
            username = whoami(oauth_token.token)["name"]
            new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
            new_repo_id = new_repo_url.repo_id
            print("Repo created successfully!", new_repo_url)

            # Upload the GGUF model
            api.upload_file(
                path_or_fileobj=gguf_output_path,
                path_in_repo=gguf_output_name,
                repo_id=new_repo_id,
            )
            print("Uploaded", gguf_output_name)

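            # Reuse the adapter's model card when one exists, then tag it and point
            # it back at the original adapter repo.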
            try:
                card = ModelCard.load(peft_model_id, token=oauth_token.token)
            except Exception:
                card = ModelCard("")
            if card.data.tags is None:
                card.data.tags = []
            card.data.tags.append("llama-cpp")
            card.data.tags.append("gguf-my-lora")
            card.data.base_model = peft_model_id
            card.text = dedent(
                f"""
                # {new_repo_id}
                This LoRA adapter was converted to GGUF format from [`{peft_model_id}`](https://huggingface.co/{peft_model_id}) via ggml.ai's [GGUF-my-lora](https://huggingface.co/spaces/ggml-org/gguf-my-lora) space.
                Refer to the [original adapter repository](https://huggingface.co/{peft_model_id}) for more details.
                
                ## Use with llama.cpp
                
                ```bash
                # with cli
                llama-cli -m base_model.gguf --lora {gguf_output_name} (...other args)

                # with server
                llama-server -m base_model.gguf --lora {gguf_output_name} (...other args)
                ```

                To learn more about using LoRA adapters with the llama.cpp server, refer to the [llama.cpp server documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md).
                """
            )
            card.save(readme_output_path)

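            # Upload the generated model card alongside the GGUF file.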
            api.upload_file(
                path_or_fileobj=readme_output_path,
                path_in_repo="README.md",
                repo_id=new_repo_id,
            )

        return (
            f'<h1>✅ DONE</h1><br/><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>'
        )
    except Exception as e:
        return (f"<h1>❌ ERROR</h1><br/><br/>{e}")


css="""/* Custom CSS to allow scrolling */
.gradio-container {overflow-y: auto;}
"""
# Create Gradio interface
with gr.Blocks(css=css) as demo: 
    gr.Markdown("You must be logged in to use GGUF-my-lora.")
    gr.LoginButton(min_width=250)

    peft_model_id = HuggingfaceHubSearch(
        label="PEFT LoRA repository",
        placeholder="Search for repository on Huggingface",
        search_type="model",
    )

    q_method = gr.Dropdown(
        ["F32", "F16", "Q8_0"],
        label="Quantization Method",
        info="(Note: Quantization less than Q8 produces very poor results)",
        value="F16",
        filterable=False,
        visible=True
    )

    private_repo = gr.Checkbox(
        value=False,
        label="Private Repo",
        info="Create a private repo under your username."
    )

    iface = gr.Interface(
        fn=process_model,
        inputs=[
            peft_model_id,
            q_method,
            private_repo,
        ],
        outputs=[
            gr.Markdown(label="output"),
        ],
        title="Convert PEFT LoRA adapters to GGUF, blazingly fast ⚡!",
        description="The space takes a PEFT LoRA (stored in a HF repo) as input, converts it to GGUF and creates a Public repo under your HF user namespace.<br/><br/>For more information, please refer to [this blog post](https://huggingface.co/blog/ngxson/gguf-my-lora)",
        api_name=False
    )


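# Factory-reboot the Space every 6 hours (21600 s) so it restarts from a clean environment.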
def restart_space():
    HfApi().restart_space(repo_id="ggml-org/gguf-my-lora", token=HF_TOKEN, factory_reboot=True)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600)
scheduler.start()

# Launch the interface
demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)