Skip to content

Commit c26f3d4

Browse files
authored
fix slow query (#477)
* use managed `is_dupe` field to filter instead of window_agg for deduplication
* fix failing tests
* add sort param on every index
* add more indexes
1 parent 10eb3af commit c26f3d4

7 files changed

Lines changed: 246 additions & 49 deletions

File tree

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Generated by Django 5.2.11 on 2026-04-02 14:16
2+
3+
from django.db import migrations, models
4+
5+
class Migration(migrations.Migration):
    """Add the ``is_dupe`` flag and a nullable ``value`` column to ``ObjectValue``.

    ``is_dupe`` is an indexed boolean that defaults to ``False``; ``value`` is
    an optional text column of at most 64 characters. The ``value`` column is
    removed again by migration 0022.
    """

    dependencies = [
        ("obstracts", "0020_objectvalue_obstracts_s_stix_id_modified_and_more"),
    ]

    # One AddField per (column name, field definition) pair, in declaration order.
    operations = [
        migrations.AddField(
            model_name="objectvalue",
            name=column_name,
            field=column_field,
        )
        for column_name, column_field in (
            ("is_dupe", models.BooleanField(default=False, db_index=True)),
            ("value", models.CharField(max_length=64, null=True, blank=True)),
        )
    ]
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Generated by Django 5.2.11 on 2026-04-03 13:46
2+
3+
import django.contrib.postgres.fields
4+
import django.db.models.fields.json
5+
import django.db.models.functions.text
6+
from django.db import migrations, models
7+
8+
9+
class Migration(migrations.Migration):
    """Rework ``ObjectValue`` columns and indexes around the ``is_dupe`` flag.

    The operations run in this order:

    1. drop the ``value`` column;
    2. remove five existing indexes;
    3. add two persisted generated columns computed from ``values``;
    4. redefine ``is_dupe``, ``knowledgebase`` and ``type`` without ``db_index``;
    5. create ten partial indexes, each restricted to rows where ``is_dupe``
       is false.

    NOTE(review): the generated columns call the database functions
    ``jsonb_values_concat`` and ``jsonb_values_list``; this migration does not
    create them -- confirm an earlier migration does.
    """

    dependencies = [
        ("obstracts", "0021_objectvalue_is_dupe"),
    ]

    operations = [
        migrations.RemoveField(model_name="objectvalue", name="value"),
        # Remove the existing indexes; one of these names is reused below for
        # a partial index.
        *(
            migrations.RemoveIndex(model_name="objectvalue", name=stale_index)
            for stale_index in (
                "obstracts_s_stix_id_type_idx",
                "obstracts_s_stix_id_modified",
                "obstracts_ov_type_stix_idx",
                "obstracts_ov_kbase_stix_idx",
                "obstracts_ov_first_value_idx",
            )
        ),
        # Persisted generated columns derived from the ``values`` JSON field.
        migrations.AddField(
            model_name="objectvalue",
            name="values_concat",
            field=models.GeneratedField(
                expression=models.Func(
                    models.F("values"), function="jsonb_values_concat"
                ),
                output_field=models.TextField(),
                db_persist=True,
                null=True,
            ),
        ),
        migrations.AddField(
            model_name="objectvalue",
            name="values_list",
            field=models.GeneratedField(
                expression=models.Func(
                    models.F("values"), function="jsonb_values_list"
                ),
                output_field=django.contrib.postgres.fields.ArrayField(
                    base_field=models.TextField(), size=None
                ),
                db_persist=True,
                null=True,
            ),
        ),
        # Redefine the three columns below without ``db_index``.
        migrations.AlterField(
            model_name="objectvalue",
            name="is_dupe",
            field=models.BooleanField(default=False),
        ),
        migrations.AlterField(
            model_name="objectvalue",
            name="knowledgebase",
            field=models.CharField(max_length=64, null=True, blank=True),
        ),
        migrations.AlterField(
            model_name="objectvalue",
            name="type",
            field=models.CharField(max_length=256),
        ),
        # Plain multi-column partial indexes.
        *(
            migrations.AddIndex(
                model_name="objectvalue",
                index=models.Index(
                    fields=list(indexed_columns),
                    name=index_name,
                    condition=models.Q(is_dupe=False),
                ),
            )
            for indexed_columns, index_name in (
                (("type", "stix_id"), "obstracts_ov_type_stix_idx"),
                (("created", "knowledgebase"), "obstracts_ov_kbase_c_idx"),
                (("modified", "knowledgebase"), "obstracts_ov_kbase_m_idx"),
                (("created", "type"), "obstracts_ov_created_type_idx"),
                (("modified", "type"), "obstracts_ov_modified_type_idx"),
            )
        ),
        # Expression indexes over keys extracted from the ``values`` JSON.
        migrations.AddIndex(
            model_name="objectvalue",
            index=models.Index(
                django.db.models.fields.json.KeyTextTransform("kb_type", "values"),
                models.F("type"),
                name="obstracts_ov_kb_type_idx",
                condition=models.Q(is_dupe=False),
            ),
        ),
        migrations.AddIndex(
            model_name="objectvalue",
            index=models.Index(
                models.F("created"),
                django.db.models.functions.text.Upper(
                    django.db.models.fields.json.KeyTextTransform("kb_id", "values")
                ),
                models.F("type"),
                name="obstracts_ov_kb_id_cidx",
                condition=models.Q(is_dupe=False),
            ),
        ),
        migrations.AddIndex(
            model_name="objectvalue",
            index=models.Index(
                models.F("modified"),
                django.db.models.functions.text.Upper(
                    django.db.models.fields.json.KeyTextTransform("kb_id", "values")
                ),
                models.F("type"),
                name="obstracts_ov_kb_id_midx",
                condition=models.Q(is_dupe=False),
            ),
        ),
        # NOTE(review): ``values_concat`` is an unbounded text column placed in
        # a default (B-tree) index; PostgreSQL rejects B-tree entries larger
        # than roughly a third of a page -- confirm concatenated values stay
        # short enough.
        migrations.AddIndex(
            model_name="objectvalue",
            index=models.Index(
                models.F("values_concat"),
                models.F("type"),
                name="obstracts_ov_values_concat_idx",
                condition=models.Q(is_dupe=False),
            ),
        ),
        migrations.AddIndex(
            model_name="objectvalue",
            index=models.Index(
                models.F("values_concat"),
                models.F("knowledgebase"),
                name="obstracts_ov_values_c_kbidx",
                condition=models.Q(is_dupe=False),
            ),
        ),
    ]
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Generated by Django 5.2.11 on 2026-04-02 14:16
2+
3+
from django.db import migrations, models
4+
5+
class Migration(migrations.Migration):
    """Create partial GIN indexes over the generated value columns.

    Two indexes are built on ``obstracts_objectvalue``, both limited to rows
    where ``is_dupe`` is false:

    * ``ctx_values_list_idx`` on ``(values_list, type)``
    * ``ctx_values_concat_idx`` on ``(values_concat, type)``

    Each forward statement drops any existing index of the same name first, so
    the migration can be re-run safely. Reversing the migration drops the
    indexes instead of leaving them behind.

    NOTE(review): ``gin_trgm_ops`` is provided by the ``pg_trgm`` extension;
    this migration does not install it -- confirm it is enabled beforehand.
    """

    dependencies = [
        ("obstracts", "0022_remove_objectvalue_obstracts_s_stix_id_type_idx_and_more"),
    ]

    operations = [
        migrations.RunSQL(
            sql="""
                DROP INDEX IF EXISTS ctx_values_list_idx;
                CREATE INDEX ctx_values_list_idx
                ON obstracts_objectvalue
                USING gin (values_list array_ops, type gin_trgm_ops) WHERE NOT is_dupe;
            """,
            # Undo the forward step rather than silently keeping the index.
            reverse_sql="DROP INDEX IF EXISTS ctx_values_list_idx;",
        ),
        migrations.RunSQL(
            sql="""
                DROP INDEX IF EXISTS ctx_values_concat_idx;
                CREATE INDEX ctx_values_concat_idx
                ON obstracts_objectvalue
                USING gin (values_concat gin_trgm_ops, type gin_trgm_ops) WHERE NOT is_dupe;
            """,
            reverse_sql="DROP INDEX IF EXISTS ctx_values_concat_idx;",
        ),
    ]

obstracts/server/models.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import typing
77
from django.conf import settings
88
from django.db import models
9+
from django.db.models.fields.json import KeyTextTransform
10+
from django.db.models.functions import Upper
911
from django.utils.text import slugify
1012
from pgvector.django import CosineDistance
1113
import txt2stix, txt2stix.extractions
@@ -29,6 +31,7 @@
2931
from obstracts.classifier.models import Cluster, DocumentEmbedding
3032
from obstracts.classifier.tasks import create_embedding_text, compute_embedding_for_document
3133
from obstracts.server.values.filters import DictFirstValue
34+
from django.contrib.postgres import indexes as pg_indexes
3235

3336
# Create your models here.
3437
if typing.TYPE_CHECKING:
@@ -380,27 +383,43 @@ class ObjectValue(models.Model):
380383
Stores extracted values from STIX objects for efficient querying and filtering.
381384
"""
382385
stix_id = models.CharField(max_length=256, db_index=True)
383-
type = models.CharField(max_length=256, db_index=True)
384-
knowledgebase = models.CharField(max_length=64, null=True, blank=True, db_index=True)
386+
type = models.CharField(max_length=256)
387+
knowledgebase = models.CharField(max_length=64, null=True, blank=True)
385388
values = models.JSONField()
386389
file = models.ForeignKey(File, on_delete=models.CASCADE, related_name='object_values')
387390
created = models.DateTimeField(default=None, null=True)
388391
modified = models.DateTimeField(default=None, null=True)
392+
is_dupe = models.BooleanField(default=False)
393+
values_concat = models.GeneratedField(
394+
expression=models.Func(models.F("values"), function="jsonb_values_concat"),
395+
output_field=models.TextField(),
396+
db_persist=True,
397+
null=True, blank=True,
398+
)
399+
values_list = models.GeneratedField(
400+
expression=models.Func(models.F("values"), function="jsonb_values_list"),
401+
output_field=ArrayField(base_field=models.TextField()),
402+
db_persist=True, null=True, blank=True,
403+
)
389404

390405
class Meta:
391406
indexes = [
392-
models.Index(fields=['stix_id', 'type'], name='obstracts_s_stix_id_type_idx'),
393-
models.Index(fields=['stix_id', 'modified'], name='obstracts_s_stix_id_modified'),
394-
models.Index(fields=['type', 'stix_id'], name='obstracts_ov_type_stix_idx'),
395-
models.Index(fields=['knowledgebase', 'stix_id'], name='obstracts_ov_kbase_stix_idx'),
396-
models.Index(DictFirstValue('values'), name='obstracts_ov_first_value_idx'),
407+
models.Index(fields=['type', 'stix_id'], name='obstracts_ov_type_stix_idx', condition=models.Q(is_dupe=False)),
408+
models.Index(fields=['created', 'knowledgebase'], name='obstracts_ov_kbase_c_idx', condition=models.Q(is_dupe=False)),
409+
models.Index(fields=['modified', 'knowledgebase'], name='obstracts_ov_kbase_m_idx', condition=models.Q(is_dupe=False)),
410+
models.Index(fields=['created', 'type'], name='obstracts_ov_created_type_idx', condition=models.Q(is_dupe=False)),
411+
models.Index(fields=['modified', 'type'], name='obstracts_ov_modified_type_idx', condition=models.Q(is_dupe=False)),
412+
models.Index(KeyTextTransform('kb_type', 'values'), 'type', name='obstracts_ov_kb_type_idx', condition=models.Q(is_dupe=False)),
413+
models.Index('created', Upper(KeyTextTransform('kb_id', 'values')), 'type', name='obstracts_ov_kb_id_cidx', condition=models.Q(is_dupe=False)),
414+
models.Index('modified', Upper(KeyTextTransform('kb_id', 'values')), 'type', name='obstracts_ov_kb_id_midx', condition=models.Q(is_dupe=False)),
415+
models.Index('values_concat', 'type', name='obstracts_ov_values_concat_idx', condition=models.Q(is_dupe=False)),
416+
models.Index('values_concat', 'knowledgebase', name='obstracts_ov_values_c_kbidx', condition=models.Q(is_dupe=False)),
397417
]
398418
unique_together = [['stix_id', 'file']]
399419

400420
def __str__(self):
401421
return f'ObjectValue(stix_id={self.stix_id}, knowledgebase={self.knowledgebase})'
402422

403-
404423
class Job(models.Model):
405424
id = models.UUIDField(primary_key=True, editable=False)
406425
history4feed_job = models.OneToOneField(

obstracts/server/values/values.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -289,19 +289,26 @@ def process_uploaded_objects_hook(instance, collection_name, objects, **kwargs):
289289
object_values_to_create.append(
290290
ObjectValue(
291291
file_id=post_uuid,
292-
**metadata
292+
**metadata,
293+
is_dupe=False, # will be updated later in a deduplication step
293294
)
294295
)
295296

296297
# Bulk create with ignore_conflicts to handle duplicates
297298
if object_values_to_create:
298-
created_count = len(
299-
ObjectValue.objects.bulk_create(
300-
object_values_to_create, ignore_conflicts=True
301-
)
299+
created = ObjectValue.objects.bulk_create(
300+
object_values_to_create, ignore_conflicts=True
301+
)
302+
new_dupes = ObjectValue.objects.filter(
303+
stix_id__in=[obj.stix_id for obj in created],
304+
is_dupe=False,
305+
).exclude(
306+
file_id__in=[obj.file_id for obj in created],
302307
)
308+
new_dupes.update(is_dupe=True)
303309
logging.info(
304-
f"Created {created_count} ObjectValue records for {len(object_values_to_create)} objects"
310+
f"Created {len(created)} ObjectValue records for {len(object_values_to_create)} objects"
305311
)
312+
logging.info(f"Marked {new_dupes.count()} ObjectValue records as duplicates")
306313
else:
307314
logging.info("No ObjectValue records to create")

obstracts/server/values/views.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ def filter_value(self, queryset, name, value):
8383
value_exact = self.data.get("value_exact", "false").lower() == "true"
8484

8585
if value_exact:
86-
return queryset.filter(values__jsonb_vexact=value)
86+
return queryset.filter(values_list__contains=[value.lower()])
8787
else:
88-
return queryset.filter(values__jsonb_vcontains=value)
88+
return queryset.filter(values_concat__icontains=value.lower())
8989

9090
def filter_noop(self, queryset, name, value):
9191
"""
@@ -109,9 +109,12 @@ class BaseObjectValueView(mixins.ListModelMixin, viewsets.GenericViewSet):
109109
queryset = ObjectValue.objects.all()
110110
serializer_class = ObjectValueSerializer
111111
pagination_class = Pagination("values")
112-
filter_backends = [DjangoFilterBackend, Ordering]
112+
filter_backends = [
113+
DjangoFilterBackend,
114+
Ordering
115+
]
113116
filterset_class = ObjectValueFilterSet
114-
ordering_fields = ["stix_id", "type", "knowledgebase", "value"]
117+
ordering_fields = ["value"]
115118
ordering = "stix_id_descending"
116119
openapi_tags = ["Object Values"]
117120

@@ -126,21 +129,12 @@ def get_queryset(self):
126129
queryset = queryset.filter(type__in=self.allowed_types)
127130

128131

129-
from django.db.models import F, Window
130-
from django.db.models.functions import RowNumber
131-
# Aggregate all post_ids for each unique stix_id
132-
queryset = queryset.annotate(
133-
rn=Window(
134-
expression=RowNumber(),
135-
partition_by=[F("stix_id")],
136-
order_by=F("stix_id").desc(),
137-
)
138-
).filter(
139-
rn=1
140-
).annotate(
141-
value=DictFirstValue(F("values")),
142-
)
143132

133+
from django.db.models import F
134+
queryset = queryset.alias(value=F('values_concat'))
135+
queryset = queryset.filter(
136+
is_dupe=False
137+
)
144138
return queryset
145139

146140

@@ -181,7 +175,7 @@ class SCOValueView(BaseObjectValueView):
181175
"""View for STIX Cyber Observable Objects (SCOs) only."""
182176

183177
allowed_types = list(sco_value_map.keys())
184-
ordering_fields = ["value", "stix_id", "type"]
178+
ordering_fields = ["value"]
185179
ordering = "value_ascending"
186180

187181
class filterset_class(ObjectValueFilterSet):
@@ -231,7 +225,8 @@ class SDOValueView(BaseObjectValueView):
231225
"""View for STIX Domain Objects (SDOs) only."""
232226

233227
allowed_types = list(sdo_value_map.keys())
234-
ordering_fields = ["stix_id", "type", "knowledgebase", "value", "created", "modified"]
228+
ordering_fields = ["value", "created", "modified"]
229+
ordering = "modified_descending"
235230

236231
class filterset_class(ObjectValueFilterSet):
237232
knowledgebases = ChoiceCSVFilter(

0 commit comments

Comments
 (0)