alonsosilva committed on
Commit
a539bdc
1 Parent(s): c29a7a5
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# Set up a new user named "user" with user ID 1000 for permission
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Run from a directory owned by "user". app.py deletes and recreates its
# database directories relative to the working directory, which a non-root
# user cannot do in the image's default working directory "/".
RUN mkdir -p $HOME/app
WORKDIR $HOME/app

# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

COPY --chown=user requirements.txt requirements.txt

# Install requirements
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user app.py app.py

COPY --chown=user test_lancedb/ test_lancedb/

COPY --chown=user test_kuzudb/ test_kuzudb/

ENTRYPOINT ["solara", "run", "app.py", "--host=0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from dotenv import find_dotenv, load_dotenv
2
+ # _ = load_dotenv(find_dotenv())
3
+
4
+ import solara
5
+
6
+ import polars as pl
7
+
8
# Download the song table (columns used below: Album, Song, Lyrics, Artist).
df = pl.read_csv(
    "https://drive.google.com/uc?export=download&id=1uD3h7xYxr9EoZ0Ggoh99JtQXa3AxtxyU"
)

import string

# Title-case album and song names with string.capwords and replace missing
# lyrics with the literal string "None". The three updates are independent of
# each other, so they are applied in a single with_columns call.
df = df.with_columns(
    pl.Series("Album", list(map(string.capwords, df["Album"]))),
    pl.Series("Song", list(map(string.capwords, df["Song"]))),
    pl.col("Lyrics").fill_null("None"),
)

# Build the "text" column: "# <Album>: <Song>", a blank line, then the lyrics.
# This has to come after the step above because it reads the cleaned columns.
text_expr = (
    pl.lit("# ")
    + pl.col("Album")
    + pl.lit(": ")
    + pl.col("Song")
    + pl.lit("\n\n")
    + pl.col("Lyrics")
)
df = df.with_columns(text=text_expr)
28
+
29
+ import shutil
30
+ import lancedb
31
+
32
# Start from an empty on-disk LanceDB database on every launch.
shutil.rmtree("test_lancedb", ignore_errors=True)
db = lancedb.connect("test_lancedb")

from lancedb.embeddings import get_registry

# Sentence-transformers embedding function using the gte-tiny model on CPU.
embeddings = get_registry().get("sentence-transformers").create(
    name="TaylorAI/gte-tiny", device="cpu"
)

from lancedb.pydantic import LanceModel, Vector


class Songs(LanceModel):
    # Schema of the "Songs" table: one row per song.
    Song: str
    Lyrics: str
    Album: str
    Artist: str
    # "text" is declared as the embedding source field and "vector" as the
    # embedding vector field, sized to the embedding function's dimensions.
    text: str = embeddings.SourceField()
    vector: Vector(embeddings.ndims()) = embeddings.VectorField()


# Create the table from the schema and load every row of the DataFrame.
table = db.create_table("Songs", schema=Songs)
table.add(data=df)
56
+
57
+ import os
58
+ from typing import Optional
59
+
60
+ from langchain_community.chat_models import ChatOpenAI
61
+
62
class ChatOpenRouter(ChatOpenAI):
    """ChatOpenAI client configured for the OpenRouter endpoint.

    Args:
        model_name: Model identifier forwarded to the base class.
        openai_api_key: API key to use. When omitted (None), the value of the
            OPENROUTER_API_KEY environment variable is used instead.
        openai_api_base: Base URL of the API; defaults to OpenRouter's.
        **kwargs: Forwarded unchanged to ChatOpenAI.
    """

    openai_api_base: str
    openai_api_key: str
    model_name: str

    def __init__(
        self,
        model_name: str,
        openai_api_key: Optional[str] = None,
        openai_api_base: str = "https://openrouter.ai/api/v1",
        **kwargs,
    ):
        # Fall back to the environment only when no key was given; previously
        # an explicitly passed key was silently discarded.
        if openai_api_key is None:
            openai_api_key = os.getenv("OPENROUTER_API_KEY")
        super().__init__(
            openai_api_base=openai_api_base,
            openai_api_key=openai_api_key,
            model_name=model_name,
            **kwargs,
        )
81
+
82
+ llm_openrouter = ChatOpenRouter(model_name="meta-llama/llama-3.1-405b-instruct", temperature=0.1)
83
+
84
def get_relevant_texts(query, table=table):
    """Return the texts of the best matches for *query* as one string.

    Searches *table* for up to 5 rows and concatenates their "text" values,
    each followed by a "---" separator, joined with single spaces.

    Args:
        query: Search string passed to ``table.search``.
        table: Table to search; defaults to the module-level ``table``.

    Returns:
        The concatenated context string (empty when nothing matches).
    """
    results = table.search(query).limit(5).to_polars()
    # Iterate over the rows actually returned instead of assuming exactly 5,
    # which raised an index error whenever the search produced fewer matches.
    return " ".join(text + "\n\n---\n\n" for text in results["text"])
91
+
92
def generate_prompt(query, table=table):
    """Build the context-grounded prompt for *query*.

    Args:
        query: The user's question (a string).
        table: Table searched for context; defaults to the module-level one.

    Returns:
        The instruction line, the retrieved context, and the question,
        concatenated into a single string.
    """
    context = get_relevant_texts(query, table)
    pieces = (
        "Answer the question based only on the following context:\n\n",
        context,
        "\n\nQuestion: ",
        query,
    )
    return "".join(pieces)
99
+
100
def generate_response(query, table=table):
    """Answer *query* with the LLM using context retrieved from *table*.

    Returns:
        The ``content`` of the model's reply to the generated prompt.
    """
    return llm_openrouter.invoke(input=generate_prompt(query, table)).content
104
+
105
+ import kuzu
106
+
107
# Rebuild the Kùzu graph database from scratch on every launch.
shutil.rmtree("test_kuzudb", ignore_errors=True)
db = kuzu.Database("test_kuzudb")
conn = kuzu.Connection(db)
# Create schema
conn.execute("CREATE NODE TABLE ARTIST(name STRING, PRIMARY KEY (name))")
conn.execute("CREATE NODE TABLE ALBUM(name STRING, PRIMARY KEY (name))")
conn.execute("CREATE NODE TABLE SONG(ID SERIAL, name STRING, lyrics STRING, PRIMARY KEY(ID))")
conn.execute("CREATE REL TABLE IN_ALBUM(FROM SONG TO ALBUM)")
conn.execute("CREATE REL TABLE FROM_ARTIST(FROM ALBUM TO ARTIST)")

# Insert nodes
# Values are passed as query parameters rather than spliced into the Cypher
# text, so quotes or backslashes in names and lyrics cannot break or alter
# the statements.
for artist in df["Artist"].unique():
    conn.execute("CREATE (artist:ARTIST {name: $name})", {"name": artist})

for album in df["Album"].unique():
    conn.execute("CREATE (album:ALBUM {name: $name})", {"name": album})

for song, lyrics in df.select(["Song", "text"]).unique().rows():
    # The double-quote -> single-quote mapping is retained from the
    # string-built version so stored lyrics keep the same quote normalisation.
    replaced_lyrics = lyrics.replace('"', "'")
    conn.execute(
        "CREATE (song:SONG {name: $name, lyrics: $lyrics})",
        {"name": song, "lyrics": replaced_lyrics},
    )

# Insert edges
# A song is identified by name plus stored text, since SONG has a SERIAL key
# and song names alone are not guaranteed to be unique.
for song, album, lyrics in df.select(["Song", "Album", "text"]).rows():
    replaced_lyrics = lyrics.replace('"', "'")
    conn.execute(
        """
        MATCH (song:SONG), (album:ALBUM)
        WHERE song.name = $song_name AND song.lyrics = $lyrics AND album.name = $album_name
        CREATE (song)-[:IN_ALBUM]->(album)
        """,
        {"song_name": song, "lyrics": replaced_lyrics, "album_name": album},
    )

for album, artist in df.select(["Album", "Artist"]).unique().rows():
    conn.execute(
        """
        MATCH (album:ALBUM), (artist:ARTIST)
        WHERE album.name = $album_name AND artist.name = $artist_name
        CREATE (album)-[:FROM_ARTIST]->(artist)
        """,
        {"album_name": album, "artist_name": artist},
    )

# Sample query: names of the songs on "The Black Album".
response = conn.execute(
    """
    MATCH (a:ALBUM {name: 'The Black Album'})<-[:IN_ALBUM]-(s:SONG) RETURN s.name
    """
)

df_response = response.get_as_pl()

from langchain_community.graphs import KuzuGraph

# LangChain wrapper over the database; its schema text is used in LLM prompts.
graph = KuzuGraph(db)
160
+
161
def generate_kuzu_prompt(user_query):
    """Build the prompt asking the LLM to translate a question into Cypher.

    The prompt consists of fixed instructions, the schema text read from
    ``graph.get_schema``, one worked example, and finally *user_query*.

    Args:
        user_query: The user's natural-language question (a string).

    Returns:
        The complete prompt string.
    """
    # NOTE(review): prompt lines are assumed to be flush-left in the original
    # file; confirm against the source if exact whitespace matters.
    instructions = """Task: Generate Kùzu Cypher statement to query a graph database.

Instructions:
Generate the Kùzu dialect of Cypher with the following rules in mind:
1. Do not omit the relationship pattern. Always use `()-[]->()` instead of `()->()`.
2. Do not include triple backticks ``` in your response. Return only Cypher.
3. Do not return any notes or comments in your response.


Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.

Schema:\n"""
    example_and_question = """\nExample:
The question is:\n"Which songs does the load album have?"
MATCH (a:ALBUM {name: 'Load'})<-[:IN_ALBUM]-(s:SONG) RETURN s.name

Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

The question is:\n"""
    return instructions + graph.get_schema + example_and_question + user_query
183
+
184
+
185
def generate_final_prompt(query,cypher_query,col_name,_values):
    """Build the prompt that turns a query result into a readable answer.

    Args:
        query: The user's original question.
        cypher_query: The Cypher statement that was executed.
        col_name: Name of the result column being reported.
        _values: Values of that column.

    Returns:
        The prompt text with the four arguments substituted in.
    """
    # NOTE(review): prompt lines are assumed to be flush-left in the original
    # file; confirm against the source if exact whitespace matters.
    template = """You are an assistant that helps to form nice and human understandable answers.
The information part contains the provided information that you must use to construct an answer.
The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Here is an example:

Question: Which managers own Neo4j stocks?
Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.

Follow this example when generating answers.
If the provided information is empty, say that you don't know the answer.
Query:\n{cypher_query}
Information:
[{col_name}: {_values}]

Question: {query}
Helpful Answer:
"""
    # Substituted values are inserted verbatim; braces inside them are not
    # re-interpreted as placeholders.
    return template.format(
        cypher_query=cypher_query,
        col_name=col_name,
        _values=_values,
        query=query,
    )
205
+
206
def generate_kg_response(query):
    """Answer *query* from the graph database via LLM-generated Cypher.

    The LLM first writes a Cypher statement for the question, the statement
    is executed on ``conn``, and the first result column is then handed back
    to the LLM to phrase the final answer.

    Args:
        query: The user's natural-language question.

    Returns:
        A ``(final_response, cypher_query)`` tuple: the answer text and the
        Cypher statement that produced it.
    """
    prompt = generate_kuzu_prompt(query)
    cypher_query_response = llm_openrouter.invoke(input=prompt)
    cypher_query = cypher_query_response.content
    # NOTE(review): this statement is model-generated and is executed
    # unchecked against the live database; consider rejecting anything that
    # is not a read-only query.
    response = conn.execute(cypher_query)
    result_df = response.get_as_pl()
    if result_df.columns:
        # Only the first returned column is reported to the LLM.
        col_name = result_df.columns[0]
        _values = result_df[col_name].to_list()
    else:
        # A statement that returns no columns used to raise IndexError here;
        # pass empty information so the prompt's "I don't know" rule applies.
        col_name, _values = "", []
    final_prompt = generate_final_prompt(query, cypher_query, col_name, _values)
    final_response = llm_openrouter.invoke(input=final_prompt).content
    return final_response, cypher_query
222
+
223
def get_classification(query):
    """Ask the LLM whether *query* is about the content of a song.

    Args:
        query: The user's question (a string).

    Returns:
        The raw ``content`` of the model's reply; the prompt asks for YES or
        NO only.
    """
    prompt = "".join(
        (
            "Answer only YES or NO. Is the question '",
            query,
            "' related to the content of a song?",
        )
    )
    return llm_openrouter.invoke(input=prompt).content
227
+
228
# Reactive state holding the current text of the query box.
query = solara.reactive("How many songs does the black album have?")


@solara.component
def Page():
    """Render the search page: a query box plus the answer for its value.

    Questions classified as being about song content are answered from the
    vector-search context (and the matching rows are shown); all other
    questions are answered through a generated Cypher query, which is shown
    below the answer.
    """
    with solara.Column(margin=10):
        solara.Markdown("# Metallica Song Finder Graph RAG")
        solara.InputText("Enter some query:", query, continuous_update=False)
        if query.value != "":
            query_class = get_classification(query.value)
            # Accept replies such as "Yes", "YES." or "YES\n": an exact match
            # on 'YES' / 'YES.' sent those to the graph branch by mistake.
            is_song_content = query_class.strip().upper().startswith("YES")
            if is_song_content:
                df_results = table.search(query.value).limit(5).to_polars()
                df_results = df_results.select(['Song', 'Album', '_distance', 'Lyrics', 'Artist'])
                response = generate_response(query.value)
                solara.Markdown("## Answer:")
                solara.Markdown(response)
                solara.Markdown("## Context:")
                solara.DataFrame(df_results, items_per_page=5)
            else:
                response, cypher_query = generate_kg_response(query.value)
                solara.Markdown("## Answer:")
                solara.Markdown(response)
                solara.Markdown("## Cypher query:")
                solara.Markdown(cypher_query)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ solara==1.39.0
2
+ polars==1.7.1
3
+ lancedb==0.13.0
4
+ sentence-transformers==3.1.1
5
+ langchain-community==0.3.0
6
+ openai==1.47.1
7
+ kuzu==0.6.0
test_kuzudb/.lock ADDED
File without changes
test_kuzudb/.shadow ADDED
File without changes
test_kuzudb/.wal ADDED
Binary file (191 kB). View file
 
test_kuzudb/catalog.kz ADDED
Binary file (76 Bytes). View file
 
test_kuzudb/data.kz ADDED
File without changes
test_kuzudb/metadata.kz ADDED
File without changes
test_kuzudb/n-0.hindex ADDED
File without changes
test_kuzudb/n-0.hindex.ovf ADDED
File without changes
test_kuzudb/n-1.hindex ADDED
File without changes
test_kuzudb/n-1.hindex.ovf ADDED
File without changes
test_kuzudb/n-2.hindex ADDED
File without changes
test_lancedb/Songs.lance/_transactions/0-dfebadff-831b-4154-b17e-51133a404c7c.txn ADDED
@@ -0,0 +1 @@
 
 
1
+ $dfebadff-831b-4154-b17e-51133a404c7c��Song ���������*string8Lyrics ���������*string8Album ���������*string8Artist ���������*string8text ���������*string82vector ���������*fixed_size_list:float:3848
test_lancedb/Songs.lance/_transactions/1-7fdeab54-9198-482c-8b47-53a30605baf8.txn ADDED
Binary file (104 Bytes). View file
 
test_lancedb/Songs.lance/_versions/1.manifest ADDED
Binary file (616 Bytes). View file
 
test_lancedb/Songs.lance/_versions/2.manifest ADDED
Binary file (677 Bytes). View file
 
test_lancedb/Songs.lance/data/b4767451-652d-4765-9ab4-ddc50aa92be7.lance ADDED
Binary file (521 kB). View file