From d3d648d9fd64eabc5aa6a4698deee69c30281846 Mon Sep 17 00:00:00 2001 From: JohnBaumb <80135794+JohnBaumb@users.noreply.github.com> Date: Sun, 19 Apr 2026 14:33:07 -0700 Subject: [PATCH] fix: batch metadata cache entity lookups MetadataCache.get_search_results previously looped over each cached entity ID and issued one SELECT per ID, producing N extra queries per cached search hit. It now resolves the entities with batched IN queries (one query per chunk of at most 500 IDs, to stay under SQLite's default bound-variable limit), then reconstructs the result list in the original result_ids order using an in-memory dict lookup. Rows whose raw_json fails to decode are now skipped. --- core/metadata_cache.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/core/metadata_cache.py b/core/metadata_cache.py index 50d547de..c52c91ac 100644 --- a/core/metadata_cache.py +++ b/core/metadata_cache.py @@ -326,20 +326,28 @@ class MetadataCache: """, (row['id'],)) conn.commit() - # Resolve entity IDs to full data + # Resolve entity IDs to full data via a single batched query + # (chunked to stay below SQLite's default variable limit). result_ids = json.loads(row['result_ids']) if not result_ids: return [] - results = [] - for eid in result_ids: - cursor.execute(""" - SELECT raw_json FROM metadata_cache_entities - WHERE source = ? AND entity_type = ? AND entity_id = ? - """, (source, search_type, eid)) - erow = cursor.fetchone() - if erow: - results.append(json.loads(erow['raw_json'])) + raw_by_id: Dict[str, dict] = {} + for i in range(0, len(result_ids), 500): + chunk = result_ids[i:i + 500] + placeholders = ','.join('?' * len(chunk)) + cursor.execute(f""" + SELECT entity_id, raw_json FROM metadata_cache_entities + WHERE source = ? AND entity_type = ? AND entity_id IN ({placeholders}) + """, [source, search_type, *chunk]) + for erow in cursor.fetchall(): + try: + raw_by_id[erow['entity_id']] = json.loads(erow['raw_json']) + except (ValueError, TypeError): + continue + + # Preserve the original result_ids ordering. 
+ results = [raw_by_id[eid] for eid in result_ids if eid in raw_by_id] # Only return if we found all (or most) entries — partial results are unreliable if len(results) >= len(result_ids) * 0.8: