rabbithole/app.py at 20ef688dcf3f8cae03e58dc637c7bba59fe1185c · punitarani/rabbithole · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Streamlit App"""

import streamlit as st
from langchain.schema import Document

from rabbithole import summarize_document
from rabbithole.embedding import embed_document
from rabbithole.keywords import get_document_keywords
from rabbithole.loader import load_file, SUPPORTED_IMG_FILE_TYPES
from rabbithole.mp3 import SUPPORTED_AV_FILE_TYPES
from rabbithole.planner import generate_plan

# Global variables
# Module-level pipeline state. Each name starts as an empty dict and is rebound
# by the "Dive in" handler at the bottom of this file on every run where the
# button is pressed.
global_documents = {}  # file name -> result of load_file() for that upload
global_embeddings = {}  # file name -> result of embed_document() for that upload
global_keywords = {}  # file name -> keywords; read by generate_plan_with_spinner()
global_summaries = {}  # file name -> summary; read by generate_plan_with_spinner()


def load_files_with_spinner(files: list) -> dict[str, list[Document]]:
    """
    Load every uploaded file, showing a spinner while each one is processed.
    The per-document content lengths of each file are printed to stdout.
    :param files: Uploaded file objects; each must expose a ``name`` attribute.
    :return: Dictionary mapping each file name to the documents loaded from it.
    """
    loaded = {}
    for upload in files:
        name = upload.name
        with st.spinner(f'Loading {name}...'):
            pages = load_file(upload)
            loaded[name] = pages
            # Debug output: one content length per loaded document.
            page_lengths = [len(page.page_content) for page in pages]
            print(name, page_lengths)
    return loaded


def embed_documents_with_spinner(documents: dict[str, list[Document]]) -> dict[str, list[list[float]]]:
    """
    Embed the text of every loaded file, showing a spinner per file.
    :param documents: Dictionary mapping a file name to its loaded documents.
    :return: Dictionary mapping each file name to the result of embedding
        the ``page_content`` of its documents.
    """
    vectors_by_name = {}
    for name, chunks in documents.items():
        with st.spinner(f'Embedding {name}...'):
            # Only the raw text of each document is passed to the embedder.
            texts = [chunk.page_content for chunk in chunks]
            vectors_by_name[name] = embed_document(texts)
    return vectors_by_name


def extract_keywords_with_spinner(embeddings: dict[str, list[list[float]]]):
    """
    Extract keywords for every embedded file, showing a spinner per file.
    :param embeddings: Dictionary mapping a file name to its embeddings.
    :return: Dictionary mapping each file name to the value returned by
        ``get_document_keywords`` for that file's embeddings.
    """
    keywords_by_name = {}
    for name, vectors in embeddings.items():
        with st.spinner(f'Extracting keywords from {name}...'):
            extracted = get_document_keywords(vectors)
            keywords_by_name[name] = extracted
    return keywords_by_name


def generate_summary_with_spinner(documents: dict[str, list[Document]]) -> dict:
    """
    Summarize every loaded file, showing a spinner while each one is processed.
    :param documents: Dictionary mapping a file name to its loaded documents.
    :return: Dictionary mapping each file name to the value returned by
        ``summarize_document`` for that file's documents.
    """
    # NOTE: the previous docstring and return annotation were copied from the
    # embedding helper; this function returns summaries, not embeddings.
    summaries = {}
    for doc_name, doc_text in documents.items():
        with st.spinner(f'Summarizing {doc_name}...'):
            summaries[doc_name] = summarize_document(doc_text)
    return summaries


def generate_plan_with_spinner() -> dict:
    """
    Build a study plan for the uploaded documents while showing a spinner.
    Reads the module-level ``global_summaries`` and ``global_keywords``.
    :return: The value returned by ``generate_plan``.
    """
    with st.spinner("Generating plan..."):
        # Returning from inside the block still closes the spinner on exit.
        return generate_plan(global_summaries, global_keywords)


# ---------------------------------------------------------------------------
# Page layout and the "Dive in" pipeline: load -> embed -> keywords ->
# summaries -> study plan. Streamlit re-executes this script on each
# interaction, so everything below runs top to bottom every time.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="RabbitHole", page_icon="🐇", layout="wide")

st.title("RabbitHole")

uploaded_files = st.file_uploader(
    "Upload content",
    type=["docx", "pdf", "txt", *SUPPORTED_IMG_FILE_TYPES, *SUPPORTED_AV_FILE_TYPES],
    accept_multiple_files=True,
)

# Section titles rendered for every plan entry, in display order. Each title
# is also the key looked up in the per-document plan data.
PLAN_SECTIONS = ("Background Concepts", "Key Concepts", "Further Reading")

if st.button("Dive in"):
    if not uploaded_files:
        st.warning("Please upload a file first.")
        st.stop()

    # Run the processing pipeline on the uploaded files. The results are bound
    # to the module-level names so generate_plan_with_spinner() can read them.
    global_documents = load_files_with_spinner(uploaded_files)
    global_embeddings = embed_documents_with_spinner(global_documents)
    global_keywords = extract_keywords_with_spinner(global_embeddings)
    global_summaries = generate_summary_with_spinner(global_documents)

    # Display the keywords and summaries
    for doc_name, doc_keywords in global_keywords.items():
        st.header(doc_name)
        st.caption("Keywords: " + ", ".join(doc_keywords))
        st.write(global_summaries[doc_name])
        st.divider()

    # Display the plan
    st.header("Study Plan")
    plan = generate_plan_with_spinner()
    for data in plan.get("plan", []):
        for doc_name, doc_data in data.items():
            st.subheader(doc_name)
            # One bold heading plus a bullet list per section; a section that
            # is missing from the plan data renders only its heading.
            for section in PLAN_SECTIONS:
                st.write(f"**{section}**")
                for concept in doc_data.get(section, []):
                    st.write(f"- {concept}")
        # Blank spacer between plan entries.
        st.write("")

    st.success('Summarization completed.')