-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathapp.py
More file actions
130 lines (108 loc) · 4.84 KB
/
Copy pathapp.py
File metadata and controls
130 lines (108 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Streamlit App"""
import streamlit as st
from langchain.schema import Document
from rabbithole import summarize_document
from rabbithole.embedding import embed_document
from rabbithole.keywords import get_document_keywords
from rabbithole.loader import load_file, SUPPORTED_IMG_FILE_TYPES
from rabbithole.mp3 import SUPPORTED_AV_FILE_TYPES
from rabbithole.planner import generate_plan
# Module-level state shared by the pipeline steps below. Every dictionary is
# keyed by document name (the uploaded file's name). The "Dive in" handler
# rebinds all four; generate_plan_with_spinner reads the summaries and keywords.
global_documents: dict[str, list[Document]] = {}  # name -> Documents returned by load_file
global_embeddings: dict[str, list[list[float]]] = {}  # name -> result of embed_document
global_keywords = {}  # name -> result of get_document_keywords
global_summaries = {}  # name -> result of summarize_document
def load_files_with_spinner(files: list) -> dict[str, list[Document]]:
    """
    Load each file in turn, showing a spinner while it is being read.

    :param files: File objects to pass to ``load_file``; each must expose a
        ``name`` attribute, which becomes its key in the result.
    :return: Dictionary mapping each file name to the Document objects
        loaded from that file.
    """
    documents = {}
    for file in files:
        with st.spinner(f'Loading {file.name}...'):
            loaded = load_file(file)
            documents[file.name] = loaded
            # Trace on stdout: the file name and the text length of each Document.
            print(file.name, [len(doc.page_content) for doc in loaded])
    return documents
def embed_documents_with_spinner(documents: dict[str, list[Document]]) -> dict[str, list[list[float]]]:
    """
    Embed the text of every document, showing a spinner per document.

    :param documents: Dictionary mapping a document name to its Document objects.
    :return: Dictionary mapping each document name to whatever
        ``embed_document`` returns for the list of that document's
        ``page_content`` strings.
    """
    embeddings = {}
    for doc_name, doc_parts in documents.items():
        with st.spinner(f'Embedding {doc_name}...'):
            # embed_document receives the raw text, not the Document wrappers.
            texts = [part.page_content for part in doc_parts]
            embeddings[doc_name] = embed_document(texts)
    return embeddings
def extract_keywords_with_spinner(embeddings: dict[str, list[list[float]]]):
    """
    Extract keywords for every document, showing a spinner per document.

    :param embeddings: Dictionary mapping a document name to its embeddings.
    :return: Dictionary mapping each document name to the value returned by
        ``get_document_keywords`` for that document's embeddings.
    """
    keywords = {}
    for doc_name, vectors in embeddings.items():
        with st.spinner(f'Extracting keywords from {doc_name}...'):
            extracted = get_document_keywords(vectors)
            keywords[doc_name] = extracted
    return keywords
def generate_summary_with_spinner(documents: dict[str, list[Document]]) -> dict:
    """
    Summarize every document, showing a spinner per document.

    :param documents: Dictionary mapping a document name to its Document objects.
    :return: Dictionary mapping each document name to the value returned by
        ``summarize_document`` for that document.

    NOTE(review): the previous return annotation and docstring described
    embeddings and appear to have been copied from the embedding helper. The
    summary type is presumably text, but ``summarize_document`` is defined
    elsewhere -- confirm and tighten the annotation.
    """
    summaries = {}
    for doc_name, doc_text in documents.items():
        with st.spinner(f'Summarizing {doc_name}...'):
            summaries[doc_name] = summarize_document(doc_text)
    return summaries
def generate_plan_with_spinner() -> dict:
    """
    Build a study plan for the uploaded documents behind a spinner.

    Reads the module-level ``global_summaries`` and ``global_keywords`` and
    passes them to ``generate_plan``.

    :return: The value returned by ``generate_plan``.
    """
    with st.spinner("Generating plan..."):
        # Returning from inside the block still exits the spinner context.
        return generate_plan(global_summaries, global_keywords)
# Page setup and upload widget.
st.set_page_config(page_title="RabbitHole", page_icon="🐇", layout="wide")
st.title("RabbitHole")

uploaded_files = st.file_uploader(
    "Upload content",
    type=["docx", "pdf", "txt", *SUPPORTED_IMG_FILE_TYPES, *SUPPORTED_AV_FILE_TYPES],
    accept_multiple_files=True,
)

if st.button("Dive in"):
    # Guard: nothing to process until at least one file has been uploaded.
    if not uploaded_files:
        st.warning("Please upload a file first.")
        st.stop()

    # Run the pipeline over every uploaded file (documents, text, images or
    # audio/video, per the uploader's accepted types):
    # load -> embed -> keywords, and load -> summarize.
    global_documents = load_files_with_spinner(uploaded_files)
    global_embeddings = embed_documents_with_spinner(global_documents)
    global_keywords = extract_keywords_with_spinner(global_embeddings)
    global_summaries = generate_summary_with_spinner(global_documents)

    # Display the keywords and summary of each document. Both dictionaries
    # derive from global_documents, so they share the same keys.
    for doc_name, doc_keywords in global_keywords.items():
        st.header(doc_name)
        st.caption("Keywords: " + ", ".join(doc_keywords))
        st.write(global_summaries[doc_name])
        st.divider()

    # Display the plan. plan["plan"] is read as a sequence of dictionaries,
    # each mapping a document name to a dictionary of named sections.
    st.header("Study Plan")
    plan = generate_plan_with_spinner()
    for data in plan.get("plan", []):
        for doc_name, doc_data in data.items():
            st.subheader(doc_name)
            # The section name is both the bold heading and the lookup key;
            # a missing section renders its heading with no bullets.
            for section in ("Background Concepts", "Key Concepts", "Further Reading"):
                st.write(f"**{section}**")
                for concept in doc_data.get(section, []):
                    st.write(f"- {concept}")
            st.write("")

    st.success('Summarization completed.')