From 90b2d6717b20e93b7a09769c85459d6aec557abb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Jun 2026 17:19:53 -0400 Subject: [PATCH 1/2] Add exact match mode and parallelize bulk-lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an `exact` parameter (label/synonyms/any) to /lookup and /bulk-lookup. When set, the eDisMax query is bypassed entirely in favour of a Solr filter query against the *_exactish fields (KeywordTokenizer + LowerCaseFilter). Filter queries are cached by Solr, so repeated exact lookups of the same term are fast after the first hit — the intended use case is NER pipelines doing bulk exact-string lookups. Also switches bulk_lookup() from a sequential for-loop to asyncio.gather(), sending all N Solr requests concurrently instead of one at a time. Closes #258 Co-Authored-By: Claude Sonnet 4.6 --- api/server.py | 125 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 90 insertions(+), 35 deletions(-) diff --git a/api/server.py b/api/server.py index 8fc60a97..e4905b31 100755 --- a/api/server.py +++ b/api/server.py @@ -7,6 +7,7 @@ * The curie with the shortest match is first, etc. * Matching names are returned first, followed by non-matching names """ +import asyncio import json import logging import statistics @@ -15,6 +16,7 @@ import os import re from collections import deque +from enum import Enum from typing import Dict, List, Union, Annotated, Optional from fastapi import Body, FastAPI, Query @@ -330,6 +332,13 @@ async def curie_lookup(curies) -> Dict[str, Dict]: return output +class ExactMatchMode(str, Enum): + """Controls exact-match behaviour in lookup queries.""" + label = "label" # match against preferred_name_exactish only + synonyms = "synonyms" # match against names_exactish only + any = "any" # match against either + + class LookupResult(BaseModel): curie:str label: str @@ -392,12 +401,17 @@ async def lookup_curies_get( "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.", # We can't use `example` here because otherwise it gets filled in when filling this in. # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955" - )] = None + )] = None, + exact: Annotated[Optional[ExactMatchMode], Query( + description="Exact-match mode: 'label' matches the preferred name only, " + "'synonyms' matches any synonym, 'any' matches either. " + "Omit for the default fuzzy search." + )] = None, ) -> List[LookupResult]: """ Returns cliques with a name or synonym that contains a specified string. """ - return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa) + return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, exact) @app.post("/lookup", @@ -451,12 +465,17 @@ async def lookup_curies_post( "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.", # We can't use `example` here because otherwise it gets filled in when filling this in. # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955" - )] = None + )] = None, + exact: Annotated[Optional[ExactMatchMode], Query( + description="Exact-match mode: 'label' matches the preferred name only, " + "'synonyms' matches any synonym, 'any' matches either. " + "Omit for the default fuzzy search." + )] = None, ) -> List[LookupResult]: """ Returns cliques with a name or synonym that contains a specified string. """ - return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa) + return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, exact) async def lookup(string: str, @@ -467,7 +486,8 @@ async def lookup(string: str, biolink_types: List[str] = None, only_prefixes: str = "", exclude_prefixes: str = "", - only_taxa: str = "" + only_taxa: str = "", + exact: Optional[ExactMatchMode] = None, ) -> List[LookupResult]: """ Returns cliques with a name or synonym that contains a specified string. @@ -564,32 +584,54 @@ async def lookup(string: str, # "hl.highlightMultiTerm": "true", }) - params = { - "query": { - "edismax": { - "query": query, - # qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input. - # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter - "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10", - # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase. - # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", - # Boosts - "bq": [], - "boost": [ - # The boost is multiplied with score -- calculating the log() reduces how quickly this increases - # the score for increasing clique identifier counts. - "log(sum(clique_identifier_count, 1))" - ], + if exact: + # Exact mode: bypass eDisMax entirely and use a filter query against the *_exactish fields. + # Filter queries are cached by Solr, making repeated lookups of the same term very fast. + string_lc_escaped = string_lc.replace('\\', '\\\\').replace('"', '\\"') + if exact == ExactMatchMode.label: + filters.append(f'preferred_name_exactish:"{string_lc_escaped}"') + elif exact == ExactMatchMode.synonyms: + filters.append(f'names_exactish:"{string_lc_escaped}"') + else: # ExactMatchMode.any + filters.append( + f'(preferred_name_exactish:"{string_lc_escaped}" OR names_exactish:"{string_lc_escaped}")' + ) + params = { + "query": "*:*", + "filter": filters, + "sort": "clique_identifier_count DESC, curie_suffix ASC", + "limit": limit, + "offset": offset, + "fields": "*, score", + "params": inner_params, + } + else: + params = { + "query": { + "edismax": { + "query": query, + # qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input. + # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter + "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10", + # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase. + # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter + "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", + # Boosts + "bq": [], + "boost": [ + # The boost is multiplied with score -- calculating the log() reduces how quickly this increases + # the score for increasing clique identifier counts. + "log(sum(clique_identifier_count, 1))" + ], + }, }, - }, - "sort": "score DESC, clique_identifier_count DESC, curie_suffix ASC", - "limit": limit, - "offset": offset, - "filter": filters, - "fields": "*, score", - "params": inner_params, - } + "sort": "score DESC, clique_identifier_count DESC, curie_suffix ASC", + "limit": limit, + "offset": offset, + "filter": filters, + "fields": "*, score", + "params": inner_params, + } logger.debug(f"Query: {json.dumps(params, indent=2)}") time_solr_start = time.perf_counter_ns() @@ -652,7 +694,7 @@ async def lookup(string: str, f"Lookup query to Solr for {json.dumps(string)} " f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, " f"biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, " - f"only_taxa={only_taxa}) " + f"only_taxa={only_taxa}, exact={exact}) " f"took {time_taken_ms:.2f}ms (with {solr_ms:.2f}ms waiting for Solr)" ) if time_taken_ms > SLOW_QUERY_THRESHOLD_MS: @@ -719,6 +761,12 @@ class NameResQuery(BaseModel): # We can't use `example` here because otherwise it gets filled in when filling this in. # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955" ) + exact: Optional[ExactMatchMode] = Field( + None, + description="Exact-match mode: 'label' matches the preferred name only, " + "'synonyms' matches any synonym, 'any' matches either. " + "Omit (or null) for the default fuzzy search.", + ) @app.post("/bulk-lookup", @@ -729,9 +777,9 @@ class NameResQuery(BaseModel): ) async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]: time_start = time.perf_counter_ns() - result = {} - for string in query.strings: - result[string] = await lookup( + + async def do_lookup(string: str): + results = await lookup( string, query.autocomplete, query.highlighting, @@ -740,7 +788,14 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]: query.biolink_types, query.only_prefixes, query.exclude_prefixes, - query.only_taxa) + query.only_taxa, + query.exact, + ) + return string, results + + pairs = await asyncio.gather(*[do_lookup(s) for s in query.strings]) + result = dict(pairs) + time_end = time.perf_counter_ns() logger.info(f"Bulk lookup query for {len(query.strings)} strings ({query}) took {(time_end - time_start)/1_000_000:.2f}ms") From 1387e38f0d78cca6d67368c58163b88c386e69b9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Jun 2026 17:20:04 -0400 Subject: [PATCH 2/2] Add tests for exact match mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses HP:0001300 (preferred_name="parkinsonian disorder", names=["Parkinsonian disease"]) as the test case, since its preferred name is absent from the names list — cleanly separating label-only vs. synonym-only matching. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_service.py | 69 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/test_service.py b/tests/test_service.py index d5522d7f..3779618c 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -223,3 +223,72 @@ def test_synonyms(): mondo_0000828_results = results['MONDO:0000828'] assert mondo_0000828_results['curie'] == 'MONDO:0000828' assert mondo_0000828_results['preferred_name'] == 'juvenile-onset Parkinson disease' + + +# HP:0001300 has preferred_name="parkinsonian disorder" and names=["Parkinsonian disease"]. +# The preferred_name is NOT in names, making it a good test case for label vs. synonyms exact mode. + +def test_exact_label_match(): + client = TestClient(app) + # "parkinsonian disorder" is the preferred label for HP:0001300 — label mode should find it. + response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'label', 'limit': 100}) + results = response.json() + curies = [r['curie'] for r in results] + assert 'HP:0001300' in curies + + # "Parkinsonian disease" is only a synonym (names entry), not the preferred label — label mode should NOT find it. + response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'label', 'limit': 100}) + results = response.json() + curies = [r['curie'] for r in results] + assert 'HP:0001300' not in curies + + +def test_exact_synonyms_match(): + client = TestClient(app) + # "Parkinsonian disease" is a synonym (names entry) for HP:0001300 — synonyms mode should find it. + response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'synonyms', 'limit': 100}) + results = response.json() + curies = [r['curie'] for r in results] + assert 'HP:0001300' in curies + + # "parkinsonian disorder" is the preferred_name but NOT in names for HP:0001300 — synonyms mode should NOT find it. + response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'synonyms', 'limit': 100}) + results = response.json() + curies = [r['curie'] for r in results] + assert 'HP:0001300' not in curies + + +def test_exact_any_match(): + client = TestClient(app) + # exact=any should match on either the preferred label or a synonym. + response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'any', 'limit': 100}) + curies = [r['curie'] for r in response.json()] + assert 'HP:0001300' in curies + + response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'any', 'limit': 100}) + curies = [r['curie'] for r in response.json()] + assert 'HP:0001300' in curies + + +def test_exact_no_partial_match(): + client = TestClient(app) + # "parkinson" is only a substring of known terms — exact mode must return no match for HP:0001300. + response = client.post("/lookup", params={'string': 'parkinson', 'exact': 'any', 'limit': 100}) + curies = [r['curie'] for r in response.json()] + assert 'HP:0001300' not in curies + + +def test_exact_bulk_lookup(): + client = TestClient(app) + params = { + 'strings': ['parkinsonian disorder', 'Parkinsonian disease', 'no match term xyz'], + 'exact': 'any', + 'limit': 10, + } + response = client.post("/bulk-lookup", json=params) + results = response.json() + + assert set(results.keys()) == {'parkinsonian disorder', 'Parkinsonian disease', 'no match term xyz'} + assert 'HP:0001300' in [r['curie'] for r in results['parkinsonian disorder']] + assert 'HP:0001300' in [r['curie'] for r in results['Parkinsonian disease']] + assert results['no match term xyz'] == []