From 90b2d6717b20e93b7a09769c85459d6aec557abb Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Tue, 2 Jun 2026 17:19:53 -0400
Subject: [PATCH 1/2] Add exact match mode and parallelize bulk-lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

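Adds an optional `exact` parameter (`label`, `synonyms` or `any`) to /lookup
and /bulk-lookup; omitting it keeps the default fuzzy search. When set, the
eDisMax query is bypassed entirely: the main query becomes `*:*` and the
lower-cased search string is applied as a Solr filter query against the
*_exactish fields (KeywordTokenizer + LowerCaseFilter): preferred_name_exactish
for `label`, names_exactish for `synonyms`, and either field for `any`.
Because relevance scoring no longer applies, exact results are sorted by
clique_identifier_count DESC, then curie_suffix ASC.
Filter queries are cached by Solr, so repeated exact lookups of the same term
are fast after the first hit — the intended use case is NER pipelines doing
bulk exact-string lookups.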

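Also switches bulk_lookup() from a sequential for-loop to asyncio.gather(),
sending all N Solr requests concurrently instead of one at a time. The response
is still a dict keyed by input string. Note that the gather() call itself puts
no upper bound on the number of requests in flight at once.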

Closes #258

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 api/server.py | 125 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 35 deletions(-)

diff --git a/api/server.py b/api/server.py
index 8fc60a97..e4905b31 100755
--- a/api/server.py
+++ b/api/server.py
@@ -7,6 +7,7 @@
   * The curie with the shortest match is first, etc.
   * Matching names are returned first, followed by non-matching names
 """
+import asyncio
 import json
 import logging
 import statistics
@@ -15,6 +16,7 @@
 import os
 import re
 from collections import deque
+from enum import Enum
 from typing import Dict, List, Union, Annotated, Optional
 
 from fastapi import Body, FastAPI, Query
@@ -330,6 +332,13 @@ async def curie_lookup(curies) -> Dict[str, Dict]:
 
     return output
 
+class ExactMatchMode(str, Enum):
+    """Controls exact-match behaviour in lookup queries."""
+    label    = "label"     # match against preferred_name_exactish only
+    synonyms = "synonyms"  # match against names_exactish only
+    any      = "any"       # match against either
+
+
 class LookupResult(BaseModel):
     curie:str
     label: str
@@ -392,12 +401,17 @@ async def lookup_curies_get(
                         "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
             # We can't use `example` here because otherwise it gets filled in when filling this in.
             # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
-        )] = None
+        )] = None,
+        exact: Annotated[Optional[ExactMatchMode], Query(
+            description="Exact-match mode: 'label' matches the preferred name only, "
+                        "'synonyms' matches any synonym, 'any' matches either. "
+                        "Omit for the default fuzzy search."
+        )] = None,
 ) -> List[LookupResult]:
     """
     Returns cliques with a name or synonym that contains a specified string.
     """
-    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
+    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, exact)
 
 
 @app.post("/lookup",
@@ -451,12 +465,17 @@ async def lookup_curies_post(
                         "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
             # We can't use `example` here because otherwise it gets filled in when filling this in.
             # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
-        )] = None
+        )] = None,
+        exact: Annotated[Optional[ExactMatchMode], Query(
+            description="Exact-match mode: 'label' matches the preferred name only, "
+                        "'synonyms' matches any synonym, 'any' matches either. "
+                        "Omit for the default fuzzy search."
+        )] = None,
 ) -> List[LookupResult]:
     """
     Returns cliques with a name or synonym that contains a specified string.
     """
-    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
+    return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, exact)
 
 
 async def lookup(string: str,
@@ -467,7 +486,8 @@ async def lookup(string: str,
            biolink_types: List[str] = None,
            only_prefixes: str = "",
            exclude_prefixes: str = "",
-           only_taxa: str = ""
+           only_taxa: str = "",
+           exact: Optional[ExactMatchMode] = None,
 ) -> List[LookupResult]:
     """
     Returns cliques with a name or synonym that contains a specified string.
@@ -564,32 +584,54 @@ async def lookup(string: str,
             # "hl.highlightMultiTerm": "true",
         })
 
-    params = {
-        "query": {
-            "edismax": {
-                "query": query,
-                # qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input.
-                # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter
-                "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10",
-                # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase.
-                # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter
-                "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20",
-                # Boosts
-                "bq": [],
-                "boost": [
-                    # The boost is multiplied with score -- calculating the log() reduces how quickly this increases
-                    # the score for increasing clique identifier counts.
-                    "log(sum(clique_identifier_count, 1))"
-                ],
+    if exact:
+        # Exact mode: bypass eDisMax entirely and use a filter query against the *_exactish fields.
+        # Filter queries are cached by Solr, making repeated lookups of the same term very fast.
+        string_lc_escaped = string_lc.replace('\\', '\\\\').replace('"', '\\"')
+        if exact == ExactMatchMode.label:
+            filters.append(f'preferred_name_exactish:"{string_lc_escaped}"')
+        elif exact == ExactMatchMode.synonyms:
+            filters.append(f'names_exactish:"{string_lc_escaped}"')
+        else:  # ExactMatchMode.any
+            filters.append(
+                f'(preferred_name_exactish:"{string_lc_escaped}" OR names_exactish:"{string_lc_escaped}")'
+            )
+        params = {
+            "query": "*:*",
+            "filter": filters,
+            "sort": "clique_identifier_count DESC, curie_suffix ASC",
+            "limit": limit,
+            "offset": offset,
+            "fields": "*, score",
+            "params": inner_params,
+        }
+    else:
+        params = {
+            "query": {
+                "edismax": {
+                    "query": query,
+                    # qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input.
+                    # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter
+                    "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10",
+                    # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase.
+                    # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter
+                    "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20",
+                    # Boosts
+                    "bq": [],
+                    "boost": [
+                        # The boost is multiplied with score -- calculating the log() reduces how quickly this increases
+                        # the score for increasing clique identifier counts.
+                        "log(sum(clique_identifier_count, 1))"
+                    ],
+                },
             },
-        },
-        "sort": "score DESC, clique_identifier_count DESC, curie_suffix ASC",
-        "limit": limit,
-        "offset": offset,
-        "filter": filters,
-        "fields": "*, score",
-        "params": inner_params,
-    }
+            "sort": "score DESC, clique_identifier_count DESC, curie_suffix ASC",
+            "limit": limit,
+            "offset": offset,
+            "filter": filters,
+            "fields": "*, score",
+            "params": inner_params,
+        }
     logger.debug(f"Query: {json.dumps(params, indent=2)}")
 
     time_solr_start = time.perf_counter_ns()
@@ -652,7 +694,7 @@ async def lookup(string: str,
         f"Lookup query to Solr for {json.dumps(string)} "
         f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, "
         f"biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, "
-        f"only_taxa={only_taxa}) "
+        f"only_taxa={only_taxa}, exact={exact}) "
         f"took {time_taken_ms:.2f}ms (with {solr_ms:.2f}ms waiting for Solr)"
     )
     if time_taken_ms > SLOW_QUERY_THRESHOLD_MS:
@@ -719,6 +761,12 @@ class NameResQuery(BaseModel):
         # We can't use `example` here because otherwise it gets filled in when filling this in.
         # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
     )
+    exact: Optional[ExactMatchMode] = Field(
+        None,
+        description="Exact-match mode: 'label' matches the preferred name only, "
+                    "'synonyms' matches any synonym, 'any' matches either. "
+                    "Omit (or null) for the default fuzzy search.",
+    )
 
 
 @app.post("/bulk-lookup",
@@ -729,9 +777,9 @@ class NameResQuery(BaseModel):
 )
 async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]:
     time_start = time.perf_counter_ns()
-    result = {}
-    for string in query.strings:
-        result[string] = await lookup(
+
+    async def do_lookup(string: str):
+        results = await lookup(
             string,
             query.autocomplete,
             query.highlighting,
@@ -740,7 +788,14 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]:
             query.biolink_types,
             query.only_prefixes,
             query.exclude_prefixes,
-            query.only_taxa)
+            query.only_taxa,
+            query.exact,
+        )
+        return string, results
+
+    pairs = await asyncio.gather(*[do_lookup(s) for s in query.strings])
+    result = dict(pairs)
+
     time_end = time.perf_counter_ns()
     logger.info(f"Bulk lookup query for {len(query.strings)} strings ({query}) took {(time_end - time_start)/1_000_000:.2f}ms")
 

From 1387e38f0d78cca6d67368c58163b88c386e69b9 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Tue, 2 Jun 2026 17:20:04 -0400
Subject: [PATCH 2/2] Add tests for exact match mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

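Uses HP:0001300 (preferred_name="parkinsonian disorder", names=["Parkinsonian
disease"]) as the test case: its preferred name is absent from the names list,
so label-only and synonym-only matching can be told apart cleanly. Also checks
that a partial string ("parkinson") does not match HP:0001300 in exact mode,
and that /bulk-lookup accepts the `exact` field.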

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/test_service.py | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tests/test_service.py b/tests/test_service.py
index d5522d7f..3779618c 100644
--- a/tests/test_service.py
+++ b/tests/test_service.py
@@ -223,3 +223,72 @@ def test_synonyms():
     mondo_0000828_results = results['MONDO:0000828']
     assert mondo_0000828_results['curie'] == 'MONDO:0000828'
     assert mondo_0000828_results['preferred_name'] == 'juvenile-onset Parkinson disease'
+
+
+# HP:0001300 has preferred_name="parkinsonian disorder" and names=["Parkinsonian disease"].
+# The preferred_name is NOT in names, making it a good test case for label vs. synonyms exact mode.
+
+def test_exact_label_match():
+    client = TestClient(app)
+    # "parkinsonian disorder" is the preferred label for HP:0001300 — label mode should find it.
+    response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'label', 'limit': 100})
+    results = response.json()
+    curies = [r['curie'] for r in results]
+    assert 'HP:0001300' in curies
+
+    # "Parkinsonian disease" is only a synonym (names entry), not the preferred label — label mode should NOT find it.
+    response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'label', 'limit': 100})
+    results = response.json()
+    curies = [r['curie'] for r in results]
+    assert 'HP:0001300' not in curies
+
+
+def test_exact_synonyms_match():
+    client = TestClient(app)
+    # "Parkinsonian disease" is a synonym (names entry) for HP:0001300 — synonyms mode should find it.
+    response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'synonyms', 'limit': 100})
+    results = response.json()
+    curies = [r['curie'] for r in results]
+    assert 'HP:0001300' in curies
+
+    # "parkinsonian disorder" is the preferred_name but NOT in names for HP:0001300 — synonyms mode should NOT find it.
+    response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'synonyms', 'limit': 100})
+    results = response.json()
+    curies = [r['curie'] for r in results]
+    assert 'HP:0001300' not in curies
+
+
+def test_exact_any_match():
+    client = TestClient(app)
+    # exact=any should match on either the preferred label or a synonym.
+    response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'any', 'limit': 100})
+    curies = [r['curie'] for r in response.json()]
+    assert 'HP:0001300' in curies
+
+    response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'any', 'limit': 100})
+    curies = [r['curie'] for r in response.json()]
+    assert 'HP:0001300' in curies
+
+
+def test_exact_no_partial_match():
+    client = TestClient(app)
+    # "parkinson" is only a substring of known terms — exact mode must return no match for HP:0001300.
+    response = client.post("/lookup", params={'string': 'parkinson', 'exact': 'any', 'limit': 100})
+    curies = [r['curie'] for r in response.json()]
+    assert 'HP:0001300' not in curies
+
+
+def test_exact_bulk_lookup():
+    client = TestClient(app)
+    params = {
+        'strings': ['parkinsonian disorder', 'Parkinsonian disease', 'no match term xyz'],
+        'exact': 'any',
+        'limit': 10,
+    }
+    response = client.post("/bulk-lookup", json=params)
+    results = response.json()
+
+    assert set(results.keys()) == {'parkinsonian disorder', 'Parkinsonian disease', 'no match term xyz'}
+    assert 'HP:0001300' in [r['curie'] for r in results['parkinsonian disorder']]
+    assert 'HP:0001300' in [r['curie'] for r in results['Parkinsonian disease']]
+    assert results['no match term xyz'] == []