Spaces:

LordFarquaad42
/

Groove-GPT

Sleeping

App Files Files Community

LordFarquaad42 committed on Mar 15

Commit

58964c1

•

1 Parent(s): 257cb4c

please work

Browse files

Files changed (16) hide show

.gitattributes +3 -0
chromadb_linux/chroma.sqlite3 +3 -0
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/data_level0.bin +3 -0
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/header.bin +3 -0
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/index_metadata.pickle +3 -0
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/length.bin +3 -0
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/link_lists.bin +3 -0
chromadb_linux_two/chroma.sqlite3 +0 -0
data/Daniel P. Friedman, Matthias Felleisen, Duane Bibby, Gerald J. Sussman - The Little Schemer-The MIT Press (1995).pdf +3 -0
data/Felleisen, Matthias_ Friedman, Daniel P. - The seasoned schemer-MIT Press (1996).pdf +3 -0
data/Harold Abelson, Gerald Jay Sussman - Structure and Interpretation of Computer Programs - 2nd Edition (MIT Electrical Engineering and Computer Science)-The MIT Press (1996).pdf +3 -0
data/Induction Review Set 1.pdf +3 -0
data/Induction Review Set 2.pdf +3 -0
data/Logic Review Set 1.pdf +3 -0
data/r5rs.pdf +3 -0
test.py +76 -0

.gitattributes CHANGED Viewed

@@ -39,3 +39,6 @@ chromadb/a81ece71-a3dd-473b-b74b-da3ab01ee2b8/data_level0.bin filter=lfs diff=lf
 chromadb_linux/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 data/Daniel[[:space:]]P.[[:space:]]Friedman,[[:space:]]Matthias[[:space:]]Felleisen,[[:space:]]Duane[[:space:]]Bibby,[[:space:]]Gerald[[:space:]]J.[[:space:]]Sussman[[:space:]]-[[:space:]]The[[:space:]]Little[[:space:]]Schemer-The[[:space:]]MIT[[:space:]]Press[[:space:]](1995).pdf filter=lfs diff=lfs merge=lfs -text
 data/*.pdf filter=lfs diff=lfs merge=lfs -text

 chromadb_linux/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 data/Daniel[[:space:]]P.[[:space:]]Friedman,[[:space:]]Matthias[[:space:]]Felleisen,[[:space:]]Duane[[:space:]]Bibby,[[:space:]]Gerald[[:space:]]J.[[:space:]]Sussman[[:space:]]-[[:space:]]The[[:space:]]Little[[:space:]]Schemer-The[[:space:]]MIT[[:space:]]Press[[:space:]](1995).pdf filter=lfs diff=lfs merge=lfs -text
 data/*.pdf filter=lfs diff=lfs merge=lfs -text
+# Attribute patterns do not recurse into a directory; "dir/**" is required to match the files inside it.
+chromadb_linux/** filter=lfs diff=lfs merge=lfs -text
+chromadb_linux_two/** filter=lfs diff=lfs merge=lfs -text
+data/** filter=lfs diff=lfs merge=lfs -text

chromadb_linux/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:714af04cf0e3baa968ac6612f2a733861d1ab80d0370d981b3f93fcabf281af5
+size 16764928

chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:359affea33231672e7e0825e18df619339bb0223c39023838a0b5b1fc1263ba8
+size 4236000

chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd60240654bf037f048c3f40d171a0c44587e74ab4005c6e54526d4bbc0b8a66
+size 100

chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ee14b7bffbe45f0c52ae28b7656afe27753caf110ef94db608c7a91f7a39279
+size 32742

chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b301c1e683052008fd9378a67c762272530b5647ceb880f0238dca5cd5b34c5f
+size 4000

chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2d4cb8307f87b4cd2cae69a9a8b24710c3deb7ab97ae915ade4b01b3fab7fb0
+size 8420

chromadb_linux_two/chroma.sqlite3 ADDED Viewed

Binary file (147 kB). View file

data/Daniel P. Friedman, Matthias Felleisen, Duane Bibby, Gerald J. Sussman - The Little Schemer-The MIT Press (1995).pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:824949fc2ca18cb942643d61188cda8c1c9a1d13c052210c6b44b27384254d51
+size 10640611

data/Felleisen, Matthias_ Friedman, Daniel P. - The seasoned schemer-MIT Press (1996).pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4de53a8c10faf0512ad13d5b1c8a7057e7154ba77b39d8ae63aa64298d8fdbe2
+size 12949205

data/Harold Abelson, Gerald Jay Sussman - Structure and Interpretation of Computer Programs - 2nd Edition (MIT Electrical Engineering and Computer Science)-The MIT Press (1996).pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:209867ddeb007f25bbdc5b552d16be82fb5a812a8185cd0bc6502975f7f27b77
+size 4621760

data/Induction Review Set 1.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ea45101c479bf1b80b37d3a3f1de401bf1d5294d2e26a0fcfa2d42a42a04ba4
+size 92528

data/Induction Review Set 2.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36a3fe8b365b57704d9f0696811a2784cbb27d319afd0acc5dad8905c61b0807
+size 86368

data/Logic Review Set 1.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a22544ee69338df4cb147966ee1ba26fe7316f98388f53568527030fec983a65
+size 85513

data/r5rs.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09b71fe4373610d763e86a728ec80146e391a1cd9c00341364200ce3b2e2bc97
+size 572547

test.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import chromadb
+from chromadb.utils import embedding_functions
+from sentence_transformers import SentenceTransformer
+from pypdf import PdfReader as reader
+import os
+# experiment with larger models
+MODEL_NAME = "Salesforce/SFR-Embedding-Mistral" # ~ 1.2 gb
+DISTANCE_FUNCTION = "cosine"
+COLLECTION_NAME = "scheme"
+EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
+client = chromadb.PersistentClient(path="./chromadb_linux_two/")
+print("Getting Collection")
+schemer = client.create_collection(
+    name=COLLECTION_NAME,
+    embedding_function=EMBEDDING_FUNC,
+)
+print(f"Number enteries in collection: {schemer.count()}")
+###########################################################################
+def get_text(pdf_path: str) -> str:
+    doc = reader(pdf_path)
+    text_content = ''
+    for page in range(len(doc.pages)):
+        page = doc.pages[page]
+        text_content += page.extract_text()
+    return text_content
+def clean_text(text: str)-> str:
+    return text.replace('\n', ' ')
+files = os.listdir('./data/')
+dataset = []
+for file in files:
+    if file.endswith(".pdf"):
+        text_content = str(get_text(os.path.join('data', file)))
+        dataset.append(text_content)
+        print(file)
+batch_size = 1024
+padding_element = '.'
+batch_documents = []
+batch_ids = []
+batch_metadata = []
+for i, document in enumerate(dataset):
+    # entering each batch
+    for j in range(0, len(document), batch_size):
+        try:
+            j_end = min(j + batch_size, len(document))
+            batch = document[j:min(j+batch_size, len(document))]
+            if len(batch) < batch_size: # Extend the batch with the padding elements
+                padding_needed = batch_size - len(batch)
+                batch = batch + str(padding_element * padding_needed)
+            print(f"Doc {i+1}/{len(dataset)}: Batch {j}/{len(document)}")
+            text = clean_text(batch)
+            batch_documents.append(text)
+            batch_ids.append(f'batch{i}{j}{batch[0]}')
+            batch_metadata.append({"length": len(batch)})
+        except Exception as e:
+            print(f"Error processing batch {j} of document {i}: {e}")
+print("Upserting into collection")
+schemer.upsert(
+    ids=[str(id) for id in batch_ids],
+    metadatas=batch_metadata,
+    documents=batch_documents,
+    )