Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 90 additions & 35 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* The curie with the shortest match is first, etc.
* Matching names are returned first, followed by non-matching names
"""
import asyncio
import json
import logging
import statistics
Expand All @@ -15,6 +16,7 @@
import os
import re
from collections import deque
from enum import Enum
from typing import Dict, List, Union, Annotated, Optional

from fastapi import Body, FastAPI, Query
Expand Down Expand Up @@ -330,6 +332,13 @@ async def curie_lookup(curies) -> Dict[str, Dict]:

return output

class ExactMatchMode(str, Enum):
    """Controls exact-match behaviour in lookup queries.

    Subclasses ``str`` so that each member compares equal to its plain string
    value (e.g. ``ExactMatchMode.label == "label"``) and can be constructed
    from that string (``ExactMatchMode("label")``).
    """

    label = "label"        # match against preferred_name_exactish only
    synonyms = "synonyms"  # match against names_exactish only
    any = "any"            # match against either


class LookupResult(BaseModel):
curie:str
label: str
Expand Down Expand Up @@ -392,12 +401,17 @@ async def lookup_curies_get(
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here because otherwise it gets filled in when filling this in.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
)] = None,
exact: Annotated[Optional[ExactMatchMode], Query(
description="Exact-match mode: 'label' matches the preferred name only, "
"'synonyms' matches any synonym, 'any' matches either. "
"Omit for the default fuzzy search."
)] = None,
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, exact)


@app.post("/lookup",
Expand Down Expand Up @@ -451,12 +465,17 @@ async def lookup_curies_post(
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here because otherwise it gets filled in when filling this in.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
)] = None,
exact: Annotated[Optional[ExactMatchMode], Query(
description="Exact-match mode: 'label' matches the preferred name only, "
"'synonyms' matches any synonym, 'any' matches either. "
"Omit for the default fuzzy search."
)] = None,
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)
return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, exact)


async def lookup(string: str,
Expand All @@ -467,7 +486,8 @@ async def lookup(string: str,
biolink_types: List[str] = None,
only_prefixes: str = "",
exclude_prefixes: str = "",
only_taxa: str = ""
only_taxa: str = "",
exact: Optional[ExactMatchMode] = None,
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
Expand Down Expand Up @@ -564,32 +584,54 @@ async def lookup(string: str,
# "hl.highlightMultiTerm": "true",
})

params = {
"query": {
"edismax": {
"query": query,
# qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter
"qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10",
# pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter
"pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20",
# Boosts
"bq": [],
"boost": [
# The boost is multiplied with score -- calculating the log() reduces how quickly this increases
# the score for increasing clique identifier counts.
"log(sum(clique_identifier_count, 1))"
],
if exact:
# Exact mode: bypass eDisMax entirely and use a filter query against the *_exactish fields.
# Filter queries are cached by Solr, making repeated lookups of the same term very fast.
string_lc_escaped = string_lc.replace('\\', '\\\\').replace('"', '\\"')
if exact == ExactMatchMode.label:
filters.append(f'preferred_name_exactish:"{string_lc_escaped}"')
elif exact == ExactMatchMode.synonyms:
filters.append(f'names_exactish:"{string_lc_escaped}"')
else: # ExactMatchMode.any
filters.append(
f'(preferred_name_exactish:"{string_lc_escaped}" OR names_exactish:"{string_lc_escaped}")'
)
params = {
"query": "*:*",
"filter": filters,
"sort": "clique_identifier_count DESC, curie_suffix ASC",
"limit": limit,
"offset": offset,
"fields": "*, score",
"params": inner_params,
}
else:
params = {
"query": {
"edismax": {
"query": query,
# qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter
"qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10",
# pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter
"pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20",
# Boosts
"bq": [],
"boost": [
# The boost is multiplied with score -- calculating the log() reduces how quickly this increases
# the score for increasing clique identifier counts.
"log(sum(clique_identifier_count, 1))"
],
},
},
},
"sort": "score DESC, clique_identifier_count DESC, curie_suffix ASC",
"limit": limit,
"offset": offset,
"filter": filters,
"fields": "*, score",
"params": inner_params,
}
"sort": "score DESC, clique_identifier_count DESC, curie_suffix ASC",
"limit": limit,
"offset": offset,
"filter": filters,
"fields": "*, score",
"params": inner_params,
}
logger.debug(f"Query: {json.dumps(params, indent=2)}")

time_solr_start = time.perf_counter_ns()
Expand Down Expand Up @@ -652,7 +694,7 @@ async def lookup(string: str,
f"Lookup query to Solr for {json.dumps(string)} "
f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, "
f"biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, "
f"only_taxa={only_taxa}) "
f"only_taxa={only_taxa}, exact={exact}) "
f"took {time_taken_ms:.2f}ms (with {solr_ms:.2f}ms waiting for Solr)"
)
if time_taken_ms > SLOW_QUERY_THRESHOLD_MS:
Expand Down Expand Up @@ -719,6 +761,12 @@ class NameResQuery(BaseModel):
# We can't use `example` here because otherwise it gets filled in when filling this in.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)
exact: Optional[ExactMatchMode] = Field(
None,
description="Exact-match mode: 'label' matches the preferred name only, "
"'synonyms' matches any synonym, 'any' matches either. "
"Omit (or null) for the default fuzzy search.",
)


@app.post("/bulk-lookup",
Expand All @@ -729,9 +777,9 @@ class NameResQuery(BaseModel):
)
async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]:
time_start = time.perf_counter_ns()
result = {}
for string in query.strings:
result[string] = await lookup(

async def do_lookup(string: str):
results = await lookup(
string,
query.autocomplete,
query.highlighting,
Expand All @@ -740,7 +788,14 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]:
query.biolink_types,
query.only_prefixes,
query.exclude_prefixes,
query.only_taxa)
query.only_taxa,
query.exact,
)
return string, results

pairs = await asyncio.gather(*[do_lookup(s) for s in query.strings])
result = dict(pairs)

time_end = time.perf_counter_ns()
logger.info(f"Bulk lookup query for {len(query.strings)} strings ({query}) took {(time_end - time_start)/1_000_000:.2f}ms")

Expand Down
69 changes: 69 additions & 0 deletions tests/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,72 @@ def test_synonyms():
mondo_0000828_results = results['MONDO:0000828']
assert mondo_0000828_results['curie'] == 'MONDO:0000828'
assert mondo_0000828_results['preferred_name'] == 'juvenile-onset Parkinson disease'


# HP:0001300 has preferred_name="parkinsonian disorder" and names=["Parkinsonian disease"].
# The preferred_name is NOT in names, making it a good test case for label vs. synonyms exact mode.

def test_exact_label_match():
    """exact=label matches HP:0001300 by its preferred name, but not by a synonym."""
    client = TestClient(app)

    # "parkinsonian disorder" is the preferred label for HP:0001300 — label mode should find it.
    response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'label', 'limit': 100})
    # Surface the server's own error text rather than an opaque TypeError from indexing an error payload.
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' in curies

    # "Parkinsonian disease" is only a synonym (names entry), not the preferred label — label mode should NOT find it.
    response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'label', 'limit': 100})
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' not in curies


def test_exact_synonyms_match():
    """exact=synonyms matches HP:0001300 by a synonym, but not by its preferred name."""
    client = TestClient(app)

    # "Parkinsonian disease" is a synonym (names entry) for HP:0001300 — synonyms mode should find it.
    response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'synonyms', 'limit': 100})
    # Surface the server's own error text rather than an opaque TypeError from indexing an error payload.
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' in curies

    # "parkinsonian disorder" is the preferred_name but NOT in names for HP:0001300 — synonyms mode should NOT find it.
    response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'synonyms', 'limit': 100})
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' not in curies


def test_exact_any_match():
    """exact=any matches HP:0001300 by either its preferred label or a synonym."""
    client = TestClient(app)

    # exact=any should match on either the preferred label or a synonym.
    response = client.post("/lookup", params={'string': 'parkinsonian disorder', 'exact': 'any', 'limit': 100})
    # Surface the server's own error text rather than an opaque TypeError from indexing an error payload.
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' in curies

    response = client.post("/lookup", params={'string': 'Parkinsonian disease', 'exact': 'any', 'limit': 100})
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' in curies


def test_exact_no_partial_match():
    """Exact mode must not return HP:0001300 for a mere substring of its names."""
    client = TestClient(app)

    # "parkinson" is only a substring of known terms — exact mode must return no match for HP:0001300.
    response = client.post("/lookup", params={'string': 'parkinson', 'exact': 'any', 'limit': 100})
    # Surface the server's own error text rather than an opaque TypeError from indexing an error payload.
    assert response.status_code == 200, response.text
    curies = [r['curie'] for r in response.json()]
    assert 'HP:0001300' not in curies


def test_exact_bulk_lookup():
    """/bulk-lookup honours `exact` and returns one result list per input string."""
    client = TestClient(app)
    params = {
        'strings': ['parkinsonian disorder', 'Parkinsonian disease', 'no match term xyz'],
        'exact': 'any',
        'limit': 10,
    }
    response = client.post("/bulk-lookup", json=params)
    # Surface the server's own error text rather than an opaque failure while reading an error payload.
    assert response.status_code == 200, response.text
    results = response.json()

    # Every requested string must come back as a key, including the one with no matches.
    assert set(results.keys()) == {'parkinsonian disorder', 'Parkinsonian disease', 'no match term xyz'}
    assert 'HP:0001300' in [r['curie'] for r in results['parkinsonian disorder']]
    assert 'HP:0001300' in [r['curie'] for r in results['Parkinsonian disease']]
    assert results['no match term xyz'] == []
Loading