Skip to content

Commit bb42836

Browse files
committed
Update text_splitter to TokenTextSplitter
1 parent 2ae5e45 commit bb42836

1 file changed

Lines changed: 2 additions & 2 deletions

File tree

rabbithole/loader.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
 4 |  4 |   import streamlit as st
 5 |  5 |   from langchain.document_loaders import Docx2txtLoader, PyMuPDFLoader, TextLoader, UnstructuredImageLoader
 6 |  6 |   from langchain.schema import Document
 7 |    | - from langchain.text_splitter import CharacterTextSplitter
   |  7 | + from langchain.text_splitter import TokenTextSplitter
 8 |  8 |   from streamlit.runtime.uploaded_file_manager import UploadedFile
 9 |  9 |
10 | 10 |   from rabbithole.mp3 import SUPPORTED_AV_FILE_TYPES, convert_to_mp3
@@ -33,7 +33,7 @@ def load_file(file: UploadedFile) -> list[Document]:
33 | 33 |       Supported file types: PDF
34 | 34 |       :return: List of Document objects
35 | 35 |       """
36 |    | -     text_splitter = CharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
   | 36 | +     text_splitter = TokenTextSplitter(model_name="davinci", chunk_size=2000, chunk_overlap=100)
37 | 37 |
38 | 38 |       # Handle .docx files
39 | 39 |       if file.name.endswith(".docx"):

0 commit comments

Comments (0)