|
1 | 1 | import io |
2 | 2 | import logging |
3 | 3 | import uuid |
| 4 | +from concurrent.futures import ThreadPoolExecutor, as_completed |
4 | 5 | from celery import shared_task, chain, current_task, Task as CeleryTask |
5 | 6 | from django.db import transaction |
6 | 7 | import typing |
|
11 | 12 | import requests |
12 | 13 |
|
13 | 14 | from obstracts.cjob import helpers |
| 15 | +from obstracts.classifier.models import DocumentEmbedding |
| 16 | +import obstracts.classifier.tasks as classifier_tasks |
14 | 17 | from ..server.models import Job |
15 | 18 | from ..server import models |
16 | 19 | from django.core.cache import cache |
@@ -192,6 +195,99 @@ def update_vulnerabilities(job_id): |
192 | 195 | state = models.JobState.PROCESS_FAILED |
193 | 196 | job.update_state(state) |
194 | 197 |
|
| 198 | + |
| 199 | +def _build_topic_embedding_for_post(post_id, force=False): |
| 200 | + try: |
| 201 | + post_file = models.File.objects.select_related("post").get(pk=post_id) |
| 202 | + post_file.create_embedding(force=force) |
| 203 | + if post_file.embedding_id: |
| 204 | + return "processed", None |
| 205 | + return "failed", f"embedding not created for post {post_id}" |
| 206 | + except Exception: |
| 207 | + logging.exception("embedding build failed for post %s", post_id) |
| 208 | + return "failed", f"embedding build failed for post {post_id}" |
| 209 | + |
| 210 | + |
| 211 | +def run_topic_embeddings_job(job_id, force=False): |
| 212 | + job = models.Job.objects.get(pk=job_id) |
| 213 | + try: |
| 214 | + qs = models.File.objects.filter( |
| 215 | + processed=True, |
| 216 | + ai_describes_incident=True, |
| 217 | + ) |
| 218 | + if not force: |
| 219 | + qs = qs.filter(embedding__isnull=True) |
| 220 | + |
| 221 | + post_ids = list(qs.values_list("post_id", flat=True)) |
| 222 | + if not post_ids: |
| 223 | + job.update_state(models.JobState.PROCESSED) |
| 224 | + return |
| 225 | + |
| 226 | + cancelled = False |
| 227 | + |
| 228 | + with ThreadPoolExecutor(max_workers=settings.CLASSIFIER_CONCURRENCY) as pool: |
| 229 | + futures = { |
| 230 | + pool.submit(_build_topic_embedding_for_post, post_id, force): post_id |
| 231 | + for post_id in post_ids |
| 232 | + } |
| 233 | + for future in as_completed(futures): |
| 234 | + status, msg = future.result() |
| 235 | + if job.is_cancelled(): |
| 236 | + cancelled = True |
| 237 | + pool.shutdown(wait=False, cancel_futures=True) |
| 238 | + if status == "processed": |
| 239 | + job.processed_items += 1 |
| 240 | + elif status == "failed": |
| 241 | + job.failed_processes += 1 |
| 242 | + if msg: |
| 243 | + job.errors.append(msg) |
| 244 | + if cancelled: |
| 245 | + job.update_state(models.JobState.CANCELLED) |
| 246 | + elif job.failed_processes and job.processed_items == 0: |
| 247 | + job.update_state(models.JobState.PROCESS_FAILED) |
| 248 | + else: |
| 249 | + job.update_state(models.JobState.PROCESSED) |
| 250 | + except Exception as e: |
| 251 | + logging.exception("topic embedding task failed") |
| 252 | + job.failed_processes += 1 |
| 253 | + job.errors.append(str(e)) |
| 254 | + job.update_state(models.JobState.PROCESS_FAILED) |
| 255 | + finally: |
| 256 | + job.save(update_fields=["errors", "processed_items", "failed_processes"]) |
| 257 | + |
| 258 | + |
| 259 | +def run_topic_clusters_job(job_id, force=False): |
| 260 | + job = models.Job.objects.get(pk=job_id) |
| 261 | + try: |
| 262 | + if job.is_cancelled(): |
| 263 | + job.update_state(models.JobState.CANCELLED) |
| 264 | + return |
| 265 | + |
| 266 | + classifier_tasks.run_clustering( |
| 267 | + force=force, |
| 268 | + workers=settings.CLASSIFIER_CONCURRENCY, |
| 269 | + should_cancel=lambda: models.Job.objects.get(pk=job_id).is_cancelled(), |
| 270 | + ) |
| 271 | + if job.is_cancelled(): |
| 272 | + job.update_state(models.JobState.CANCELLED) |
| 273 | + return |
| 274 | + job.processed_items += 1 |
| 275 | + job.update_state(models.JobState.PROCESSED) |
| 276 | + except classifier_tasks.ClusteringCancelled: |
| 277 | + job.update_state(models.JobState.CANCELLED) |
| 278 | + except Exception as e: |
| 279 | + logging.exception("topic cluster task failed") |
| 280 | + job.failed_processes += 1 |
| 281 | + job.errors.append(str(e)) |
| 282 | + job.update_state(models.JobState.PROCESS_FAILED) |
| 283 | + finally: |
| 284 | + job.save(update_fields=["errors", "processed_items", "failed_processes"]) |
| 285 | + |
| 286 | + |
| 287 | +@shared_task |
| 288 | +def build_topic_clusters(job_id, force=False): |
| 289 | + run_topic_clusters_job(job_id, force=force) |
| 290 | + |
195 | 291 | @shared_task |
196 | 292 | def add_pdf_to_post(job_id, post_id): |
197 | 293 | job = models.Job.objects.get(pk=job_id) |
@@ -298,6 +394,7 @@ def process_post(self, job_id, post_id, profile_id=None, *args): |
298 | 394 | ) |
299 | 395 |
|
300 | 396 | file.set_txt2stix_data(processor.txt2stix_data) |
| 397 | + file.create_embedding() |
301 | 398 |
|
302 | 399 | file.processed = True |
303 | 400 | file.save( |
|
0 commit comments