Update split_files_to_excel.py
split_files_to_excel.py: CHANGED (+7 -7)
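In short: each of the seven `tokenizer.decode(...)` calls in `split_chunks_by_tokens_period` now strips the leading `'<s> '` marker from the decoded text before it becomes a chunk's `page_content`.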
```diff
@@ -347,7 +347,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
             # print(f"len(encoded):{len(encoded)}<min_chunk_size:{min_chunk_size}")
             continue
         elif skip_next:
-            split_doc = Document(page_content=tokenizer.decode(encoded), metadata=doc.metadata.copy())
+            split_doc = Document(page_content=tokenizer.decode(encoded).replace('<s> ', ''), metadata=doc.metadata.copy())
             split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
             resized.append(split_doc)
             # print(f"Added a document of {split_doc.metadata['token_length']} tokens 1")
@@ -371,7 +371,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
                 # print("not len(remaining_encoded)>max_length + min_chunk_size")
                 current_encoded = remaining_encoded #if the last chunk is to small, concatenate it with the previous one
                 is_last_chunk = True
-                split_doc = Document(page_content=tokenizer.decode(current_encoded), metadata=doc.metadata.copy())
+                split_doc = Document(page_content=tokenizer.decode(current_encoded).replace('<s> ', ''), metadata=doc.metadata.copy())
                 split_doc.metadata['token_length'] = len(tokenizer.encode(split_doc.page_content))
                 resized.append(split_doc)
                 # print(f"Added a document of {split_doc.metadata['token_length']} tokens 2")
@@ -390,18 +390,18 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
                     # print("not is_first_chunk", period_index_b)
                     if period_index_b == -1: # Period not found in overlap
                         # print(". not found in overlap")
-                        split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # Keep regular splitting
+                        split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # Keep regular splitting
                     else:
                         if is_last_chunk: #not the first but the last
                             # print("is_last_chunk")
-                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:], metadata=doc.metadata.copy())
+                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:].replace('<s> ', ''), metadata=doc.metadata.copy())
                             #print("Should start after \".\"")
                         else:
                             # print("not is_last_chunk", period_index_e, len(to_encode))
-                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e], metadata=doc.metadata.copy()) # Split at the begining and the end
+                            split_doc = Document(page_content=tokenizer.decode(current_encoded)[period_index_b+1:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # Split at the begining and the end
                 else: #first chunk
                     # print("else")
-                    split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e], metadata=doc.metadata.copy()) # split only at the end if its first chunk
+                    split_doc = Document(page_content=tokenizer.decode(current_encoded)[:period_index_e].replace('<s> ', ''), metadata=doc.metadata.copy()) # split only at the end if its first chunk
                 if 'titles' in split_doc.metadata:
                     # print("title in metadata")
                     chunk_counter += 1
@@ -425,7 +425,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
            chunk_counter += 1
            doc.metadata['chunk_id'] = chunk_counter
            doc.metadata['token_length'] = len(encoded)
-           doc.page_content = tokenizer.decode(encoded)
+           doc.page_content = tokenizer.decode(encoded).replace('<s> ', '')
            resized.append(doc)
            print(f"Added a document of {doc.metadata['token_length']} tokens 4")
        print(f"Number of chunks before resplitting: {len(documents)} \nAfter splitting: {len(resized)}")
```
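The `'<s> '` prefix being stripped is the string form of the BOS token, which SentencePiece-based tokenizers re-insert when `decode()` is called without `skip_special_tokens=True`. Below is a minimal sketch of the behaviour and of the fix, assuming a Hugging Face `transformers` tokenizer; the model name is illustrative only, not taken from this Space:

```python
from transformers import AutoTokenizer

# Illustrative model choice; any Llama/Mistral-style SentencePiece tokenizer
# shows the same behaviour. Not necessarily the tokenizer this Space uses.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

encoded = tokenizer.encode("A chunk of text to re-split.")
print(tokenizer.decode(encoded))
# e.g. "<s> A chunk of text to re-split."  -- the BOS marker leaks into the text

# The commit's fix: strip the marker textually after decoding.
clean = tokenizer.decode(encoded).replace('<s> ', '')

# An alternative with the same effect on this prefix: have the tokenizer
# drop special tokens itself (this also removes EOS, PAD, etc.).
clean_alt = tokenizer.decode(encoded, skip_special_tokens=True)
```

The textual `replace` targets only the literal `'<s> '` string, whereas `skip_special_tokens=True` would drop every special token. Note also that the `split_doc` branches recompute `token_length` from the cleaned `page_content`, while the last hunk keeps `len(encoded)`, which still counts the BOS token.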