muchdogesec
diff --git a/‎.env.example‎
Lines changed: 6 additions & 1 deletion b/‎.env.example‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎.env.markdown‎
Lines changed: 10 additions & 1 deletion b/‎.env.markdown‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Dockerfile.deploy‎
Lines changed: 1 addition & 1 deletion b/‎Dockerfile.deploy‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Dockerfile.test‎
Lines changed: 1 addition & 1 deletion b/‎Dockerfile.test‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docker-compose.yml‎
Lines changed: 9 additions & 2 deletions b/‎docker-compose.yml‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎obstracts/cjob/tasks.py‎
Lines changed: 97 additions & 0 deletions b/‎obstracts/cjob/tasks.py‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎obstracts/classifier/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎obstracts/classifier/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎obstracts/classifier/admin.py‎ b/‎obstracts/classifier/admin.py‎
diff --git a/‎obstracts/classifier/apps.py‎
Lines changed: 7 additions & 0 deletions b/‎obstracts/classifier/apps.py‎
Lines changed: 7 additions & 0 deletions
@@ -55,4 +55,9 @@ HISTORY4FEED_EARLIEST_SEARCH_DATE=
 HISTORY4FEED_WAYBACK_SLEEP_SECONDS=
 HISTORY4FEED_REQUEST_RETRY_COUNT=
 # pdfshift
-PDFSHIFT_API_KEY=
+PDFSHIFT_API_KEY=
+
+# clustering settings
+CLASSIFIER_MIN_CLUSTER_SIZE=
+CLASSIFIER_LABEL_SAMPLE_SIZE=
+CLASSIFIER_CONCURRENCY=
@@ -160,4 +160,13 @@ If you're not using a Proxy it is very likely you'll run into rate limits on the
 ## pdfshift
 
 * `PDFSHIFT_API_KEY`: get from `https://app.pdfshift.io/`
-	* is used to generate PDFs from posts. If you use the generate PDF setting in profile, this variable must be used.
+	* is used to generate PDFs from posts. If you use the generate PDF setting in profile, this variable must be used.
+
+
+## clustering settings
+* `CLASSIFIER_MIN_CLUSTER_SIZE`: `5-10`
+	* This is the minimum number of posts that should be in a cluster. Clusters smaller than this will be discarded. Setting this value too low may result in many small clusters that are not meaningful, while setting this value too high may result in missing out on smaller but still relevant clusters.
+* `CLASSIFIER_LABEL_SAMPLE_SIZE`: `5-20`
+	* This is the number of posts that will be sampled from each cluster to generate a label. Setting this value too low may result in less accurate labels, while setting this value too high may result in increased processing time.
+* `CLASSIFIER_CONCURRENCY`: `12`
+	* This is the number of worker threads used both to build post embeddings and to label clusters concurrently. Adjust this value according to your system's capabilities and the volume of data being processed.
@@ -1,4 +1,4 @@
-FROM python:3.11-slim
+FROM python:3.11
 ENV PYTHONUNBUFFERED=1
 
 WORKDIR /usr/src/app
 
@@ -1,4 +1,4 @@
-FROM python:3.11-slim
+FROM python:3.11
 ENV PYTHONUNBUFFERED=1
 
 ARG DJANGO_DEBUG=
 
@@ -1,4 +1,4 @@
-FROM python:3.11-slim
+FROM python:3.11
 ENV PYTHONUNBUFFERED=1
 
 ARG DJANGO_DEBUG=
 
@@ -24,7 +24,6 @@ services:
             - 8001:8001
         depends_on:
             - celery
-
     celery:
         extends: env_django
         command: >
@@ -36,5 +35,13 @@ services:
                 condition: service_started
             env_django: 
                 condition: service_completed_successfully
+        environment:
+            - CLASSIFIER_MODEL_PATH=/opt/clusters/classfier_hdbscan.joblib
+        volumes:
+            - clusters:/opt/clusters/
+
     redis:
-        image: "redis:alpine"
+        image: "redis:alpine"
+
+volumes:
+    clusters:
@@ -1,6 +1,7 @@
 import io
 import logging
 import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from celery import shared_task, chain, current_task, Task as CeleryTask
 from django.db import transaction
 import typing
@@ -11,6 +12,8 @@
 import requests
 
 from obstracts.cjob import helpers
+from obstracts.classifier.models import DocumentEmbedding
+import obstracts.classifier.tasks as classifier_tasks
 from ..server.models import Job
 from ..server import models
 from django.core.cache import cache
@@ -192,6 +195,99 @@ def update_vulnerabilities(job_id):
         state = models.JobState.PROCESS_FAILED
     job.update_state(state)
 
+
+def _build_topic_embedding_for_post(post_id, force=False):
+    """Create the embedding for a single post file and report the outcome.
+
+    Looks up the ``File`` with primary key ``post_id`` and calls its
+    ``create_embedding(force=force)``.
+
+    Returns:
+        A ``(status, message)`` tuple: ``("processed", None)`` when the file
+        has a truthy ``embedding_id`` afterwards, otherwise
+        ``("failed", <reason>)``. Exceptions are logged and turned into a
+        ``"failed"`` result instead of propagating to the caller.
+    """
+    try:
+        file_obj = models.File.objects.select_related("post").get(pk=post_id)
+        file_obj.create_embedding(force=force)
+        has_embedding = bool(file_obj.embedding_id)
+    except Exception:
+        logging.exception("embedding build failed for post %s", post_id)
+        return "failed", f"embedding build failed for post {post_id}"
+
+    if has_embedding:
+        return "processed", None
+    return "failed", f"embedding not created for post {post_id}"
+
+
+def run_topic_embeddings_job(job_id, force=False):
+    """Build embeddings for processed incident posts and track progress on a job.
+
+    Selects ``File`` rows with ``processed=True`` and
+    ``ai_describes_incident=True`` (restricted to rows without an embedding
+    unless ``force`` is true), builds each embedding on a thread pool sized by
+    ``settings.CLASSIFIER_CONCURRENCY`` and tallies the results on the job.
+
+    Args:
+        job_id: Primary key of the ``Job`` that tracks this run.
+        force: When true, files that already have an embedding are included
+            and ``force`` is passed through to ``create_embedding``.
+
+    Final job state: ``CANCELLED`` if cancellation was observed,
+    ``PROCESS_FAILED`` if at least one item failed and none succeeded (or an
+    unexpected exception was raised), ``PROCESSED`` otherwise. The counters
+    and error list are always saved on exit.
+    """
+    job = models.Job.objects.get(pk=job_id)
+    try:
+        qs = models.File.objects.filter(
+            processed=True,
+            ai_describes_incident=True,
+        )
+        if not force:
+            qs = qs.filter(embedding__isnull=True)
+
+        post_ids = list(qs.values_list("post_id", flat=True))
+        if not post_ids:
+            job.update_state(models.JobState.PROCESSED)
+            return
+
+        cancelled = False
+
+        with ThreadPoolExecutor(max_workers=settings.CLASSIFIER_CONCURRENCY) as pool:
+            futures = {
+                pool.submit(_build_topic_embedding_for_post, post_id, force): post_id
+                for post_id in post_ids
+            }
+            for future in as_completed(futures):
+                status, msg = future.result()
+                # Record the finished item before looking at cancellation so
+                # work that already completed is never dropped from the tally.
+                if status == "processed":
+                    job.processed_items += 1
+                elif status == "failed":
+                    job.failed_processes += 1
+                    if msg:
+                        job.errors.append(msg)
+                if job.is_cancelled():
+                    cancelled = True
+                    pool.shutdown(wait=False, cancel_futures=True)
+                    # Stop consuming as_completed(): the futures cancelled
+                    # above would otherwise raise CancelledError from
+                    # result() (reported as PROCESS_FAILED) or never be
+                    # reported as done. Leaving the ``with`` block still waits
+                    # for tasks that are already running.
+                    break
+        if cancelled:
+            job.update_state(models.JobState.CANCELLED)
+        elif job.failed_processes and job.processed_items == 0:
+            job.update_state(models.JobState.PROCESS_FAILED)
+        else:
+            job.update_state(models.JobState.PROCESSED)
+    except Exception as e:
+        logging.exception("topic embedding task failed")
+        job.failed_processes += 1
+        job.errors.append(str(e))
+        job.update_state(models.JobState.PROCESS_FAILED)
+    finally:
+        job.save(update_fields=["errors", "processed_items", "failed_processes"])
+
+
+def run_topic_clusters_job(job_id, force=False):
+    """Run topic clustering once and record the outcome on the given job.
+
+    Skips clustering when the job is already cancelled. Otherwise calls
+    ``classifier_tasks.run_clustering`` with
+    ``settings.CLASSIFIER_CONCURRENCY`` workers and a callback that reports
+    whether the job has been cancelled.
+
+    Final job state: ``CANCELLED`` when cancellation is seen before or after
+    clustering or ``ClusteringCancelled`` is raised, ``PROCESS_FAILED`` on
+    any other exception, ``PROCESSED`` otherwise. The counters and error
+    list are always saved on exit.
+    """
+    job = models.Job.objects.get(pk=job_id)
+
+    def _cancel_requested():
+        # Fetches the job again on every call rather than reusing ``job``.
+        return models.Job.objects.get(pk=job_id).is_cancelled()
+
+    try:
+        was_cancelled = job.is_cancelled()
+        if not was_cancelled:
+            classifier_tasks.run_clustering(
+                force=force,
+                workers=settings.CLASSIFIER_CONCURRENCY,
+                should_cancel=_cancel_requested,
+            )
+            was_cancelled = job.is_cancelled()
+
+        if was_cancelled:
+            job.update_state(models.JobState.CANCELLED)
+        else:
+            job.processed_items += 1
+            job.update_state(models.JobState.PROCESSED)
+    except classifier_tasks.ClusteringCancelled:
+        job.update_state(models.JobState.CANCELLED)
+    except Exception as exc:
+        logging.exception("topic cluster task failed")
+        job.failed_processes += 1
+        job.errors.append(str(exc))
+        job.update_state(models.JobState.PROCESS_FAILED)
+    finally:
+        job.save(update_fields=["errors", "processed_items", "failed_processes"])
+
+
+@shared_task
+def build_topic_clusters(job_id, force=False):
+    """Celery task entry point that delegates to ``run_topic_clusters_job``."""
+    run_topic_clusters_job(job_id, force=force)
+
 @shared_task
 def add_pdf_to_post(job_id, post_id):
     job = models.Job.objects.get(pk=job_id)
@@ -298,6 +394,7 @@ def process_post(self, job_id, post_id, profile_id=None, *args):
                 )
 
         file.set_txt2stix_data(processor.txt2stix_data)
+        file.create_embedding()
 
         file.processed = True
         file.save(
 
@@ -0,0 +1 @@
+# Names the AppConfig class Django should use for this app.
+# NOTE(review): Django 3.2+ auto-detects a single AppConfig in apps.py and
+# deprecates this setting (ignored from 4.1) — confirm the target Django version.
+default_app_config = 'obstracts.classifier.apps.ClassifierConfig'
@@ -0,0 +1,7 @@
+from django.apps import AppConfig
+
+
+class ClassifierConfig(AppConfig):
+    """Django application configuration for the ``obstracts.classifier`` app."""
+
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "obstracts.classifier"
+    verbose_name = "Classifier"
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM python:3.11-slim`
	`1`	`+FROM python:3.11`
`2`	`2`	`ENV PYTHONUNBUFFERED=1`
`3`	`3`
`4`	`4`	`WORKDIR /usr/src/app`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+default_app_config = 'obstracts.classifier.apps.ClassifierConfig'`