Make group filtering safer for upsert searches (#517)

mdavis332 · web-flow · commit c8d4cc965da2 · 2022-10-31T12:42:29.000-04:00
diff --git a/cif/httpd/common.py b/cif/httpd/common.py
@@ -7,7 +7,7 @@
 VALID_FILTERS = {
     'indicator', 'itype', 'confidence', 'provider', 'limit', 'application', 'nolog', 'tags', 'days',
     'hours', 'groups', 'reporttime', 'cc', 'asn', 'asn_desc', 'rdata', 'firsttime', 'lasttime', 'region', 'id',
-    'portlist', 'protocol', 'tlp', 'sort', 'group'
+    'portlist', 'protocol', 'tlp', 'sort',
 }
 TOKEN_FILTERS = ['username', 'token']
 
diff --git a/cif/store/zelasticsearch/filters.py b/cif/store/zelasticsearch/filters.py
@@ -275,15 +275,7 @@ def filter_build(s, filters, token=None):
 
     s = filter_reporttime(s, q_filters)
 
-    # transform all other filters into term=
-    s = filter_terms(s, q_filters)
-
-    # indicator search/submit should mostly use singular 'group' field, but the cifsdk uses 
-    # groups (plural) for both indicators and tokens
-    if q_filters.get('group'):
-        q_filters['groups'] = q_filters.pop('group')
-        s = filter_groups(s, q_filters)
-    elif q_filters.get('groups'):
+    if q_filters.get('groups'):
         s = filter_groups(s, q_filters)
     else:
         if token and (not token.get('admin') or token.get('admin') == ''):
@@ -292,4 +284,7 @@ def filter_build(s, filters, token=None):
     if q_filters.get('tags'):
         s = filter_tags(s, q_filters)
 
+    # transform all other filters into term=
+    s = filter_terms(s, q_filters)
+
     return s
diff --git a/cif/store/zelasticsearch/indicator.py b/cif/store/zelasticsearch/indicator.py
@@ -244,23 +244,25 @@ def upsert(self, token, indicators, flush=False):
         for d in agg:
             d = agg[d]
 
+            # start assembling search filters
             filters = {'limit': 1}
             for x in UPSERT_MATCH:
                 if d.get(x):
                     if x == 'confidence':
                         filters[x] = '{},{}'.format(d[x], d[x])
+                    elif x == 'group':
+                        # indicator submit api expects 'group' (singular)
+                        # but search api expects 'groups' (plural)
+                        filters['groups'] = d[x]
+                    elif x == 'rdata':
+                        # if wildcard in rdata, don't add it to upsert search; 
+                        # urls can contain asterisks, and complex wildcard queries can 
+                        # create ES timeouts
+                        if '*' not in d['rdata']:
+                            filters[x] = d[x]
                     else:
                         filters[x] = d[x]
 
-            if d.get('tags'):
-                filters['tags'] = d['tags']
-
-            if d.get('rdata'):
-                # if wildcard in rdata, don't add it to upsert search; urls can contain asterisks, 
-                # and complex wildcard queries can create ES timeouts
-                if '*' not in d['rdata']:
-                    filters['rdata'] = d['rdata']
-
             # search for existing, return latest record
             try:
                 # search the current index only
diff --git a/test/zelasticsearch/test_store_elasticsearch_indicators_upsert.py b/test/zelasticsearch/test_store_elasticsearch_indicators_upsert.py
@@ -183,6 +183,19 @@ def indicator_diff_group():
         confidence=7.0
     )
 
+@pytest.fixture
+def indicator_diff_rdata():  # requested by test_store_elasticsearch_indicators_upsert9
+    return Indicator(
+        indicator='example.com',
+        tags='botnet',
+        provider='csirtg.io',
+        group='everyone',
+        lasttime=arrow.utcnow().datetime,
+        reporttime=arrow.utcnow().datetime,
+        confidence=7.0,
+        rdata='ns 10.1.1.1'  # contains no '*', so upsert() keeps it in its search filters
+    )
+
 @pytest.fixture
 def new_indicator():
     return Indicator(
@@ -510,4 +523,93 @@ def test_store_elasticsearch_indicators_upsert8(store, token, indicator, indicat
             assert i['count'] == 2
         # the indicator with group 'everyone2' should only have a count of 1
         else:
-            assert i['count'] == 1
+            assert i['count'] == 1
+
+## test duplicate indicator submission, different rdata; 
+# ensure upserts are NOT matching on diff rdata
+@pytest.mark.skipif(DISABLE_TESTS, reason='need to set CIF_ELASTICSEARCH_TEST=1 to run')
+def test_store_elasticsearch_indicators_upsert9(store, token, indicator, indicator_diff_rdata):
+    """Upserts must not merge records whose rdata differs; wildcard rdata is not searched on."""
+    pprint(indicator)
+
+    indicator_dict = indicator.__dict__()
+
+    x = store.handle_indicators_create(token, indicator_dict, flush=True)
+    assert x == 1
+
+    pprint(indicator_diff_rdata)
+
+    indicator_rdata_dict = indicator_diff_rdata.__dict__()
+
+    y = store.handle_indicators_create(token, indicator_rdata_dict, flush=True)
+    assert y == 1
+
+    x = store.handle_indicators_search(token, {
+        'indicator': 'example.com',
+        'nolog': 1
+    })
+
+    z = json.loads(x)
+    z = [i['_source'] for i in z['hits']['hits']]
+
+    pprint(z)
+
+    assert len(z) == 2
+
+    # refresh 1st indicator times and resubmit to upsert/increase count
+    # ensure it doesn't upsert into 2nd indicator (same indicator, but with rdata set)
+    indicator_dict['lasttime'] = indicator_dict['reporttime'] = arrow.utcnow().datetime
+    new_observation = Indicator(**indicator_dict)
+
+    x = store.handle_indicators_create(token, new_observation.__dict__(), flush=True)
+    assert x == 1
+
+    y = store.handle_indicators_search(token, {
+        'indicator': 'example.com',
+        'nolog': 1
+    })
+
+    z = json.loads(y)
+    z = [i['_source'] for i in z['hits']['hits']]
+
+    assert len(z) == 2 # should still have 2 indicators, but should have upserted into 1st
+
+    pprint(z)
+
+    for i in z:
+        # orig indicator (w/o rdata) should have upsert matched once for a total count of 2
+        if not i.get('rdata'):
+            assert i['count'] == 2
+        # the indicator with rdata (different) should only have a count of 1
+        else:
+            assert i['count'] == 1
+
+    # refresh 2nd indicator times, swap its rdata for a value containing an asterisk, and resubmit;
+    # wildcard rdata is left out of the upsert search, so this test expects the new observation
+    # to upsert into the 1st indicator (w/o rdata) and leave the 2nd indicator's count untouched
+    indicator_rdata_dict['lasttime'] = indicator_rdata_dict['reporttime'] = arrow.utcnow().datetime
+    indicator_rdata_dict['rdata'] = 'some*test'
+    new_rdata_observation = Indicator(**indicator_rdata_dict)
+
+    x = store.handle_indicators_create(token, new_rdata_observation.__dict__(), flush=True)
+    assert x == 1
+
+    y = store.handle_indicators_search(token, {
+        'indicator': 'example.com',
+        'nolog': 1
+    })
+
+    z = json.loads(y)
+    z = [i['_source'] for i in z['hits']['hits']]
+
+    assert len(z) == 2 # should still have 2 indicators, but latest should have upserted into 1st
+
+    pprint(z)
+
+    for i in z:
+        # orig indicator (w/o rdata) should have upsert matched twice now for a total count of 3
+        if not i.get('rdata'):
+            assert i['count'] == 3
+        # the indicator with rdata (different) should only have a count of 1
+        else:
+            assert i['count'] == 1

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`VALID_FILTERS = {`
`8`	`8`	`'indicator', 'itype', 'confidence', 'provider', 'limit', 'application', 'nolog', 'tags', 'days',`
`9`	`9`	`'hours', 'groups', 'reporttime', 'cc', 'asn', 'asn_desc', 'rdata', 'firsttime', 'lasttime', 'region', 'id',`
`10`		`- 'portlist', 'protocol', 'tlp', 'sort', 'group'`
	`10`	`+ 'portlist', 'protocol', 'tlp', 'sort',`
`11`	`11`	`}`
`12`	`12`	`TOKEN_FILTERS = ['username', 'token']`
`13`	`13`