# Source: rabbithole/app.py at 801208c720a7f6e5a94671f669ab0683233ef1ec · punitarani/rabbithole · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Streamlit App"""

import streamlit as st
from langchain.schema import Document

from rabbithole import summarize_document
from rabbithole.embedding import embed_document
from rabbithole.keywords import get_document_keywords
from rabbithole.loader import load_file, SUPPORTED_IMG_FILE_TYPES
from rabbithole.mp3 import SUPPORTED_AV_FILE_TYPES

# Global variables
# Module-level result holders, each starting out as an empty dict. They are
# rebound to the pipeline outputs inside the "Dive in" button handler below.
global_documents: dict[str, list[Document]] = {}  # file name -> loaded documents
global_embeddings: dict[str, list[list[float]]] = {}  # file name -> embeddings
global_keywords: dict = {}  # file name -> result of get_document_keywords
global_summaries: dict = {}  # file name -> result of summarize_document


def load_files_with_spinner(files: list) -> dict[str, list[Document]]:
    """
    Load every given file and collect the results keyed by file name.
    A Streamlit spinner labelled with the file name is shown while each file loads.
    :param files: Files to load; each one must expose a ``name`` attribute.
    :return: Dictionary mapping each file's name to the result of ``load_file`` for it.
    """
    loaded = {}
    for upload in files:
        filename = upload.name
        # One spinner per file, so the user sees which file is in progress
        with st.spinner(f'Loading {filename}...'):
            loaded[filename] = load_file(upload)
    return loaded


def embed_documents_with_spinner(documents: dict[str, list[Document]]) -> dict[str, list[list[float]]]:
    """
    Embed the text of every loaded document and collect the results by document name.
    A Streamlit spinner labelled with the document name is shown while each one is embedded.
    :param documents: Dictionary mapping a document name to its list of Document objects.
    :return: Dictionary mapping each document name to the result of ``embed_document``.
    """
    vectors_by_name = {}
    for name, chunks in documents.items():
        with st.spinner(f'Embedding {name}...'):
            # embed_document receives the raw page contents, not the Document wrappers
            chunk_texts = [chunk.page_content for chunk in chunks]
            vectors_by_name[name] = embed_document(chunk_texts)
    return vectors_by_name


def extract_keywords_with_spinner(embeddings: dict[str, list[list[float]]]):
    """
    Extract keywords for every document from its embeddings.
    A Streamlit spinner labelled with the document name is shown during each extraction.
    :param embeddings: Dictionary mapping a document name to that document's embeddings.
    :return: Dictionary mapping each document name to the result of ``get_document_keywords``.
    """
    keywords_by_name = {}
    for name in embeddings:
        with st.spinner(f'Extracting keywords from {name}...'):
            keywords_by_name[name] = get_document_keywords(embeddings[name])
    return keywords_by_name


def generate_summary_with_spinner(documents: dict[str, list[Document]]) -> dict:
    """
    Summarize every loaded document and collect the summaries by document name.
    A Streamlit spinner labelled with the document name is shown while each one is summarized.
    :param documents: Dictionary mapping a document name to its list of Document objects.
    :return: Dictionary mapping each document name to the result of ``summarize_document``.
    """
    summaries = {}
    for doc_name, doc_text in documents.items():
        # The whole list of Document objects is handed to the summarizer as one unit
        with st.spinner(f'Summarizing {doc_name}...'):
            summaries[doc_name] = summarize_document(doc_text)
    return summaries


# Page setup
st.set_page_config(page_title="RabbitHole", page_icon="🐇", layout="wide")
st.title("RabbitHole")

# Accept the three document formats plus every extension listed in
# SUPPORTED_IMG_FILE_TYPES and SUPPORTED_AV_FILE_TYPES.
uploaded_files = st.file_uploader(
    "Upload content",
    type=["docx", "pdf", "txt", *SUPPORTED_IMG_FILE_TYPES, *SUPPORTED_AV_FILE_TYPES],
    accept_multiple_files=True,
)

if st.button("Dive in"):
    # Nothing uploaded: warn the user and stop before the pipeline runs
    if not uploaded_files:
        st.warning("Please upload a file first.")
        st.stop()

    # Pipeline: load -> embed -> keywords; summaries are built from the loaded documents
    global_documents = load_files_with_spinner(uploaded_files)
    global_embeddings = embed_documents_with_spinner(global_documents)
    global_keywords = extract_keywords_with_spinner(global_embeddings)
    global_summaries = generate_summary_with_spinner(global_documents)

    # Render one section per document: its keywords followed by its summary
    for doc_name in global_keywords:
        st.header(doc_name)
        st.subheader("Keywords")
        st.write(global_keywords[doc_name])
        st.subheader("Summary")
        st.write(global_summaries[doc_name])

    st.success("Summarization completed.")