LordFarquaad42 committed on
Commit
58964c1
1 Parent(s): 257cb4c

please work

Browse files
.gitattributes CHANGED
@@ -39,3 +39,6 @@ chromadb/a81ece71-a3dd-473b-b74b-da3ab01ee2b8/data_level0.bin filter=lfs diff=lf
39
  chromadb_linux/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
40
  data/Daniel[[:space:]]P.[[:space:]]Friedman,[[:space:]]Matthias[[:space:]]Felleisen,[[:space:]]Duane[[:space:]]Bibby,[[:space:]]Gerald[[:space:]]J.[[:space:]]Sussman[[:space:]]-[[:space:]]The[[:space:]]Little[[:space:]]Schemer-The[[:space:]]MIT[[:space:]]Press[[:space:]](1995).pdf filter=lfs diff=lfs merge=lfs -text
41
  data/*.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
39
  chromadb_linux/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
40
  data/Daniel[[:space:]]P.[[:space:]]Friedman,[[:space:]]Matthias[[:space:]]Felleisen,[[:space:]]Duane[[:space:]]Bibby,[[:space:]]Gerald[[:space:]]J.[[:space:]]Sussman[[:space:]]-[[:space:]]The[[:space:]]Little[[:space:]]Schemer-The[[:space:]]MIT[[:space:]]Press[[:space:]](1995).pdf filter=lfs diff=lfs merge=lfs -text
41
  data/*.pdf filter=lfs diff=lfs merge=lfs -text
42
# Attribute patterns that name a directory do not apply to the files inside
# it, so each directory is matched recursively with `/**` to route its
# contents through Git LFS.
chromadb_linux/** filter=lfs diff=lfs merge=lfs -text
chromadb_linux_two/** filter=lfs diff=lfs merge=lfs -text
data/** filter=lfs diff=lfs merge=lfs -text
chromadb_linux/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:714af04cf0e3baa968ac6612f2a733861d1ab80d0370d981b3f93fcabf281af5
3
+ size 16764928
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:359affea33231672e7e0825e18df619339bb0223c39023838a0b5b1fc1263ba8
3
+ size 4236000
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd60240654bf037f048c3f40d171a0c44587e74ab4005c6e54526d4bbc0b8a66
3
+ size 100
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee14b7bffbe45f0c52ae28b7656afe27753caf110ef94db608c7a91f7a39279
3
+ size 32742
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b301c1e683052008fd9378a67c762272530b5647ceb880f0238dca5cd5b34c5f
3
+ size 4000
chromadb_linux/db11bb9f-77bd-41d9-b90b-4225edaac50b/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d4cb8307f87b4cd2cae69a9a8b24710c3deb7ab97ae915ade4b01b3fab7fb0
3
+ size 8420
chromadb_linux_two/chroma.sqlite3 ADDED
Binary file (147 kB). View file
 
data/Daniel P. Friedman, Matthias Felleisen, Duane Bibby, Gerald J. Sussman - The Little Schemer-The MIT Press (1995).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:824949fc2ca18cb942643d61188cda8c1c9a1d13c052210c6b44b27384254d51
3
+ size 10640611
data/Felleisen, Matthias_ Friedman, Daniel P. - The seasoned schemer-MIT Press (1996).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de53a8c10faf0512ad13d5b1c8a7057e7154ba77b39d8ae63aa64298d8fdbe2
3
+ size 12949205
data/Harold Abelson, Gerald Jay Sussman - Structure and Interpretation of Computer Programs - 2nd Edition (MIT Electrical Engineering and Computer Science)-The MIT Press (1996).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:209867ddeb007f25bbdc5b552d16be82fb5a812a8185cd0bc6502975f7f27b77
3
+ size 4621760
data/Induction Review Set 1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ea45101c479bf1b80b37d3a3f1de401bf1d5294d2e26a0fcfa2d42a42a04ba4
3
+ size 92528
data/Induction Review Set 2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36a3fe8b365b57704d9f0696811a2784cbb27d319afd0acc5dad8905c61b0807
3
+ size 86368
data/Logic Review Set 1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a22544ee69338df4cb147966ee1ba26fe7316f98388f53568527030fec983a65
3
+ size 85513
data/r5rs.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09b71fe4373610d763e86a728ec80146e391a1cd9c00341364200ce3b2e2bc97
3
+ size 572547
test.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.utils import embedding_functions
3
+ from sentence_transformers import SentenceTransformer
4
+ from pypdf import PdfReader as reader
5
+ import os
6
+
7
# Configuration for the embedding model and the persistent Chroma collection.
# experiment with larger models
MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"  # ~ 1.2 gb per original note -- TODO confirm
DISTANCE_FUNCTION = "cosine"
COLLECTION_NAME = "scheme"
EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
client = chromadb.PersistentClient(path="./chromadb_linux_two/")
print("Getting Collection")

# get_or_create_collection keeps the script re-runnable against the persisted
# store; create_collection errors out once the collection already exists.
# The "hnsw:space" metadata key is how DISTANCE_FUNCTION is actually applied
# to the index (without it the constant above has no effect).
schemer = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=EMBEDDING_FUNC,
    metadata={"hnsw:space": DISTANCE_FUNCTION},
)
print(f"Number enteries in collection: {schemer.count()}")
20
+
21
+
22
+ ###########################################################################
23
def get_text(pdf_path: str) -> str:
    """Return the extracted text of every page of a PDF, concatenated.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, with no separator inserted
        between pages.
    """
    doc = reader(pdf_path)
    # Iterate the pages directly and join once instead of growing a string
    # with `+=`. `or ''` is defensive: a page that yields no text (None or
    # empty) simply contributes nothing.
    return ''.join(page.extract_text() or '' for page in doc.pages)
31
+
32
def clean_text(text: str) -> str:
    """Flatten *text* onto a single line by turning each newline into a space."""
    single_line = ' '.join(text.split('\n'))
    return single_line
34
+
35
# Load the full text of every PDF found in ./data/ into `dataset`.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# give each document the same position in `dataset` on every run.
files = sorted(os.listdir('./data/'))
dataset = []

for file in files:
    if file.endswith(".pdf"):
        # get_text already returns a str, so no extra conversion is needed.
        text_content = get_text(os.path.join('data', file))
        dataset.append(text_content)
        print(file)
43
+
44
# Split every document into fixed-size character chunks and store them in the
# collection. Each chunk is `batch_size` characters long; the final chunk of a
# document is right-padded with `padding_element` to reach that length.
batch_size = 1024
padding_element = '.'
batch_documents = []
batch_ids = []
batch_metadata = []

for i, document in enumerate(dataset):

    # entering each batch
    for j in range(0, len(document), batch_size):
        # Slicing clamps at the end of the string, so no explicit min() is needed.
        batch = document[j:j + batch_size]

        if len(batch) < batch_size:  # Extend the batch with the padding elements
            batch += padding_element * (batch_size - len(batch))

        print(f"Doc {i+1}/{len(dataset)}: Batch {j}/{len(document)}")
        batch_documents.append(clean_text(batch))
        # Document index and chunk offset already identify a chunk uniquely;
        # the delimiters keep the two numbers from running together, and no
        # raw document character (possibly whitespace) ends up in the id.
        batch_ids.append(f'batch-{i}-{j}')
        # Measured after padding, so this equals batch_size for every chunk.
        batch_metadata.append({"length": len(batch)})

print("Upserting into collection")
# Upsert in bounded slices rather than one call: this stays under the
# client's maximum batch size (assumed to be well above 1000 -- TODO confirm
# for the installed Chroma version) and makes an empty dataset a no-op
# instead of an upsert with empty lists.
upsert_chunk = 1000
for start in range(0, len(batch_documents), upsert_chunk):
    stop = start + upsert_chunk
    schemer.upsert(
        ids=batch_ids[start:stop],
        metadatas=batch_metadata[start:stop],
        documents=batch_documents[start:stop],
    )