Add keyword generation and improve UI

punitarani · punitarani · commit 15799b5495d4 · 2023-05-14T00:07:31.000-07:00
diff --git a/app.py b/app.py
@@ -4,11 +4,16 @@
 from langchain.schema import Document
 
 from rabbithole import summarize_document
+from rabbithole.embedding import embed_document
+from rabbithole.keywords import get_document_keywords
 from rabbithole.loader import load_file, SUPPORTED_IMG_FILE_TYPES
 from rabbithole.mp3 import SUPPORTED_AV_FILE_TYPES
 
 # Global variables
-results = {}
+global_documents = {}
+global_embeddings = {}
+global_keywords = {}
+global_summaries = {}
 
 
 def load_files_with_spinner(files: list) -> dict[str, list[Document]]:
@@ -26,12 +31,48 @@ def load_files_with_spinner(files: list) -> dict[str, list[Document]]:
     return documents
 
 
-def run_summarization(document: list[Document], doc_name: str):
-    """Execute the text summarization"""
-    with st.spinner(f'Summarizing {doc_name}...'):
-        summary = summarize_document(document[:2])
-        results[doc_name] = summary
-        st.write(f"'{doc_name}' Summary:\n{summary}")
+def embed_documents_with_spinner(documents: dict[str, list[Document]]) -> dict[str, list[list[float]]]:
+    """
+    Embed each document and return the embeddings keyed by document name.
+    Display a loading animation while embedding each document.
+    :param documents: Dictionary mapping document names to their lists of Document objects.
+    :return: Dictionary mapping document names to their embeddings.
+    """
+    # Collect the embeddings of each document, keyed by document name
+    embeddings = {}
+    for doc_name, doc_text in documents.items():
+        with st.spinner(f'Embedding {doc_name}...'):
+            embeddings[doc_name] = embed_document([doc.page_content for doc in doc_text])
+    return embeddings
+
+
+def extract_keywords_with_spinner(embeddings: dict[str, list[list[float]]]):
+    """
+    Extract keywords from each document's embeddings and return them keyed by document name.
+    Display a loading animation while extracting the keywords of each document.
+    :param embeddings: Dictionary mapping document names to their embeddings.
+    :return: Dictionary mapping document names to their extracted keywords.
+    """
+    # Collect the keywords of each document, keyed by document name
+    keywords = {}
+    for doc_name, doc_embeddings in embeddings.items():
+        with st.spinner(f'Extracting keywords from {doc_name}...'):
+            keywords[doc_name] = get_document_keywords(doc_embeddings)
+    return keywords
+
+
+def generate_summary_with_spinner(documents: dict[str, list[Document]]) -> dict[str, list[list[float]]]:
+    """
+    Summarize each document and return the summaries keyed by document name.
+    Display a loading animation while summarizing each document.
+    :param documents: Dictionary mapping document names to their lists of Document objects.
+    :return: Dictionary mapping document names to summaries of their first two Document objects.
+    """
+    summaries = {}
+    for doc_name, doc_text in documents.items():
+        with st.spinner(f'Summarizing {doc_name}...'):
+            summaries[doc_name] = summarize_document(doc_text[:2])
+    return summaries
 
 
 st.title("RabbitHole")
@@ -46,10 +87,17 @@ def run_summarization(document: list[Document], doc_name: str):
         st.stop()
 
     # Load the text from the uploaded PDF files
-    texts = load_files_with_spinner(uploaded_files)
+    global_documents = load_files_with_spinner(uploaded_files)
+    global_embeddings = embed_documents_with_spinner(global_documents)
+    global_keywords = extract_keywords_with_spinner(global_embeddings)
+    global_summaries = generate_summary_with_spinner(global_documents)
 
-    # Run the summarization for each document
-    for doc_name, doc_text in texts.items():
-        run_summarization(doc_text, doc_name)
+    # Display the keywords and summaries
+    for doc_name, doc_keywords in global_keywords.items():
+        st.header(doc_name)
+        st.subheader("Keywords")
+        st.write(doc_keywords)
+        st.subheader("Summary")
+        st.write(global_summaries[doc_name])
 
     st.success('Summarization completed.')