Spaces:
Sleeping
Sleeping
LordFarquaad42
committed on
Commit
•
58964c1
1
Parent(s):
257cb4c
please work
Browse files
- .gitattributes +3 -0
- chromadb_linux/chroma.sqlite3 +3 -0
- chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/data_level0.bin +3 -0
- chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/header.bin +3 -0
- chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/index_metadata.pickle +3 -0
- chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/length.bin +3 -0
- chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/link_lists.bin +3 -0
- chromadb_linux_two/chroma.sqlite3 +0 -0
- data/Daniel P. Friedman, Matthias Felleisen, Duane Bibby, Gerald J. Sussman - The Little Schemer-The MIT Press (1995).pdf +3 -0
- data/Felleisen, Matthias_ Friedman, Daniel P. - The seasoned schemer-MIT Press (1996).pdf +3 -0
- data/Harold Abelson, Gerald Jay Sussman - Structure and Interpretation of Computer Programs - 2nd Edition (MIT Electrical Engineering and Computer Science)-The MIT Press (1996).pdf +3 -0
- data/Induction Review Set 1.pdf +3 -0
- data/Induction Review Set 2.pdf +3 -0
- data/Logic Review Set 1.pdf +3 -0
- data/r5rs.pdf +3 -0
- test.py +76 -0
.gitattributes
CHANGED
@@ -39,3 +39,6 @@ chromadb/a81ece71-a3dd-473b-b74b-da3ab01ee2b8/data_level0.bin filter=lfs diff=lf
|
|
39 |
chromadb_linux/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
40 |
data/Daniel[[:space:]]P.[[:space:]]Friedman,[[:space:]]Matthias[[:space:]]Felleisen,[[:space:]]Duane[[:space:]]Bibby,[[:space:]]Gerald[[:space:]]J.[[:space:]]Sussman[[:space:]]-[[:space:]]The[[:space:]]Little[[:space:]]Schemer-The[[:space:]]MIT[[:space:]]Press[[:space:]](1995).pdf filter=lfs diff=lfs merge=lfs -text
|
41 |
data/*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
39 |
chromadb_linux/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
40 |
data/Daniel[[:space:]]P.[[:space:]]Friedman,[[:space:]]Matthias[[:space:]]Felleisen,[[:space:]]Duane[[:space:]]Bibby,[[:space:]]Gerald[[:space:]]J.[[:space:]]Sussman[[:space:]]-[[:space:]]The[[:space:]]Little[[:space:]]Schemer-The[[:space:]]MIT[[:space:]]Press[[:space:]](1995).pdf filter=lfs diff=lfs merge=lfs -text
|
41 |
data/*.pdf filter=lfs diff=lfs merge=lfs -text
|
42 |
+
# Attribute patterns that name a directory ("dir" or "dir/") are not applied
# to the files inside it; "dir/**" is the form that covers the contents.
chromadb_linux/** filter=lfs diff=lfs merge=lfs -text
chromadb_linux_two/** filter=lfs diff=lfs merge=lfs -text
data/** filter=lfs diff=lfs merge=lfs -text
|
chromadb_linux/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:714af04cf0e3baa968ac6612f2a733861d1ab80d0370d981b3f93fcabf281af5
|
3 |
+
size 16764928
|
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:359affea33231672e7e0825e18df619339bb0223c39023838a0b5b1fc1263ba8
|
3 |
+
size 4236000
|
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd60240654bf037f048c3f40d171a0c44587e74ab4005c6e54526d4bbc0b8a66
|
3 |
+
size 100
|
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9ee14b7bffbe45f0c52ae28b7656afe27753caf110ef94db608c7a91f7a39279
|
3 |
+
size 32742
|
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b301c1e683052008fd9378a67c762272530b5647ceb880f0238dca5cd5b34c5f
|
3 |
+
size 4000
|
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2d4cb8307f87b4cd2cae69a9a8b24710c3deb7ab97ae915ade4b01b3fab7fb0
|
3 |
+
size 8420
|
chromadb_linux_two/chroma.sqlite3
ADDED
Binary file (147 kB). View file
|
|
data/Daniel P. Friedman, Matthias Felleisen, Duane Bibby, Gerald J. Sussman - The Little Schemer-The MIT Press (1995).pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:824949fc2ca18cb942643d61188cda8c1c9a1d13c052210c6b44b27384254d51
|
3 |
+
size 10640611
|
data/Felleisen, Matthias_ Friedman, Daniel P. - The seasoned schemer-MIT Press (1996).pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4de53a8c10faf0512ad13d5b1c8a7057e7154ba77b39d8ae63aa64298d8fdbe2
|
3 |
+
size 12949205
|
data/Harold Abelson, Gerald Jay Sussman - Structure and Interpretation of Computer Programs - 2nd Edition (MIT Electrical Engineering and Computer Science)-The MIT Press (1996).pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:209867ddeb007f25bbdc5b552d16be82fb5a812a8185cd0bc6502975f7f27b77
|
3 |
+
size 4621760
|
data/Induction Review Set 1.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ea45101c479bf1b80b37d3a3f1de401bf1d5294d2e26a0fcfa2d42a42a04ba4
|
3 |
+
size 92528
|
data/Induction Review Set 2.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:36a3fe8b365b57704d9f0696811a2784cbb27d319afd0acc5dad8905c61b0807
|
3 |
+
size 86368
|
data/Logic Review Set 1.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a22544ee69338df4cb147966ee1ba26fe7316f98388f53568527030fec983a65
|
3 |
+
size 85513
|
data/r5rs.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09b71fe4373610d763e86a728ec80146e391a1cd9c00341364200ce3b2e2bc97
|
3 |
+
size 572547
|
test.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
from chromadb.utils import embedding_functions
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
from pypdf import PdfReader as reader
|
5 |
+
import os
|
6 |
+
|
7 |
+
# experiment with larger models
MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"  # ~ 1.2 gb
DISTANCE_FUNCTION = "cosine"
COLLECTION_NAME = "scheme"
EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
# The store is persisted on disk, so the collection may already exist from an
# earlier run of this script.
client = chromadb.PersistentClient(path="./chromadb_linux_two/")
print("Getting Collection")

# get_or_create_collection lets the script be re-run against the persisted
# store; create_collection errors out once the collection exists.
# DISTANCE_FUNCTION was previously defined but never handed to Chroma, so it
# is passed here through the "hnsw:space" collection metadata key.
# NOTE(review): the metric is fixed when the collection is first created;
# a store built before this change keeps the metric it was created with.
schemer = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=EMBEDDING_FUNC,
    metadata={"hnsw:space": DISTANCE_FUNCTION},
)
print(f"Number enteries in collection: {schemer.count()}")
|
20 |
+
|
21 |
+
|
22 |
+
###########################################################################
|
23 |
+
def get_text(pdf_path: str) -> str:
    """Return the extracted text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``reader``.

    Returns:
        The per-page ``extract_text()`` results concatenated in page order,
        with no separator inserted between pages.
    """
    doc = reader(pdf_path)
    # Iterate the pages directly and join once, rather than indexing with
    # range(len(...)) and growing the string with repeated ``+=``.
    return "".join(page.extract_text() for page in doc.pages)
|
31 |
+
|
32 |
+
def clean_text(text: str) -> str:
    """Flatten *text* onto a single line by turning each newline into a space."""
    pieces = text.split('\n')
    flattened = ' '.join(pieces)
    return flattened
|
34 |
+
|
35 |
+
# Sort the listing: os.listdir() returns entries in arbitrary order, and each
# document's position in ``dataset`` is later embedded in its chunk ids, so an
# unsorted listing would make those ids differ from one run to the next.
files = sorted(os.listdir('./data/'))
dataset = []

for file in files:
    # Only PDF files are ingested; anything else in the directory is skipped.
    if not file.endswith(".pdf"):
        continue
    dataset.append(get_text(os.path.join('data', file)))
    print(file)
|
43 |
+
|
44 |
+
# Split every document into fixed-size character windows and store them in
# the collection.
batch_size = 1024  # characters per stored chunk
padding_element = '.'  # single fill character for the last, short chunk
# Number of records sent per upsert() call, so one call never carries the
# whole corpus. NOTE(review): 1000 is a conservative choice, not a value read
# from Chroma -- confirm against the client's maximum batch size.
upsert_chunk_size = 1000
batch_documents = []
batch_ids = []
batch_metadata = []

for i, document in enumerate(dataset):
    # Walk the document in steps of batch_size characters. Slicing clamps at
    # the end of the string, so no explicit min() bound is needed.
    for j in range(0, len(document), batch_size):
        batch = document[j:j + batch_size]
        # Right-pad the final short window so every chunk is exactly
        # batch_size characters long.
        batch = batch.ljust(batch_size, padding_element)

        print(f"Doc {i+1}/{len(dataset)}: Batch {j}/{len(document)}")
        batch_documents.append(clean_text(batch))
        # Id layout is unchanged (document index, offset, first character of
        # the chunk), so a re-run overwrites the same records.
        batch_ids.append(f'batch{i}{j}{batch[0]}')
        # Recorded after padding, so this is always batch_size.
        batch_metadata.append({"length": len(batch)})

print("Upserting into collection")
# Upsert in bounded slices. When there are no chunks at all the loop body
# never runs, so upsert() is not called with empty lists.
for start in range(0, len(batch_ids), upsert_chunk_size):
    stop = start + upsert_chunk_size
    schemer.upsert(
        ids=batch_ids[start:stop],
        metadatas=batch_metadata[start:stop],
        documents=batch_documents[start:stop],
    )
|