Skip to content

Commit dfb4e31

Browse files
committed
Update PDF loader to also pass the extracted text through the text loader
1 parent bb42836 commit dfb4e31

1 file changed

Lines changed: 9 additions & 1 deletion

File tree

rabbithole/loader.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,15 @@ def load_file(file: UploadedFile) -> list[Document]:
4343
# Handle .pdf files
4444
elif file.name.endswith(".pdf"):
4545
temp_file = save_to_temp_file(file)
46-
return PyMuPDFLoader(file_path=temp_file).load_and_split(text_splitter=text_splitter)
46+
pdf_doc = PyMuPDFLoader(file_path=temp_file).load_and_split(text_splitter=text_splitter)
47+
48+
# Save the extracted PDF text to a temporary file
49+
temp_file = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
50+
temp_file.write("\n".join([page.page_content for page in pdf_doc]).encode())
51+
temp_file.close()
52+
53+
# Load the file using TextLoader
54+
return TextLoader(file_path=temp_file.name, encoding="utf-8").load_and_split(text_splitter=text_splitter)
4755

4856
# Handle .txt files
4957
elif file.name.endswith(".txt"):

0 commit comments

Comments
 (0)