Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from .country_specific.india.in_pan_recognizer import InPanRecognizer
from .country_specific.india.in_passport_recognizer import InPassportRecognizer
from .country_specific.india.in_voter_recognizer import InVoterRecognizer
from .country_specific.india.in_upi_recognizer import InUpiRecognizer

# Italy recognizers
from .country_specific.italy.it_driver_license_recognizer import (
Expand Down Expand Up @@ -221,6 +222,7 @@
"SgUenRecognizer",
"InVoterRecognizer",
"InPassportRecognizer",
"InUpiRecognizer",
"FiPersonalIdentityCodeRecognizer",
"EsNieRecognizer",
"EsPassportRecognizer",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""India-specific recognizers."""

from .in_aadhaar_recognizer import InAadhaarRecognizer
from .in_gstin_recognizer import InGstinRecognizer
from .in_pan_recognizer import InPanRecognizer
from .in_passport_recognizer import InPassportRecognizer
from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer
from .in_voter_recognizer import InVoterRecognizer
from .in_upi_recognizer import InUpiRecognizer

__all__ = [
"InAadhaarRecognizer",
Expand All @@ -14,4 +14,5 @@
"InVoterRecognizer",
"InVehicleRegistrationRecognizer",
"InPassportRecognizer",
]
"InUpiRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import List, Optional
from presidio_analyzer import Pattern, PatternRecognizer


class InUpiRecognizer(PatternRecognizer):
    """
    Recognize Indian UPI (Unified Payments Interface) IDs.

    UPI IDs are used for digital payments in India and follow the format
    ``username@bankhandle`` (e.g., shaurya@okicici, 9876543210@paytm).

    Two patterns are used:

    * "UPI ID (High)" (score 0.7): the handle is one of a fixed list of
      well-known handles (okicici, okhdfcbank, okaxis, oksbi, paytm, ybl,
      upi, apl, ibl, axl, waicici, wahdfcbank, timecosmos, rapl, mbk,
      ikwik, freecharge).
    * "UPI ID (Medium)" (score 0.4): the handle is any run of 2-64 letters.

    Both patterns end with a negative lookahead that rejects a handle
    immediately followed by ``.`` or ``-`` and an alphanumeric character.
    Without it, the domain label of an ordinary e-mail address
    (``john@gmail.com`` -> ``john@gmail``) would be reported as a UPI ID.
    The lookahead is zero-width, so the span of a genuine match is not
    affected, and a UPI ID followed by sentence punctuation
    (``... to shaurya@okicici.``) is still matched.

    Reference: https://www.npci.org.in/what-we-do/upi/product-overview

    :param patterns: List of patterns to be used by this recognizer;
        falls back to ``PATTERNS`` when empty or None
    :param context: List of context words to increase confidence in
        detection; falls back to ``CONTEXT`` when empty or None
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    COUNTRY_CODE = "in"

    PATTERNS = [
        Pattern(
            "UPI ID (High)",
            r"\b([a-zA-Z0-9.\-_]{2,256}@"
            r"(okicici|okhdfcbank|okaxis|oksbi|paytm|ybl|upi|apl|ibl|axl"
            r"|waicici|wahdfcbank|timecosmos|rapl|mbk|ikwik|freecharge))\b"
            # Not the first label of an e-mail domain such as "paytm.com".
            r"(?![.-][a-zA-Z0-9])",
            0.7,
        ),
        Pattern(
            "UPI ID (Medium)",
            r"\b([a-zA-Z0-9.\-_]{2,256}@[a-zA-Z]{2,64})\b"
            # Not the first label of an e-mail domain such as "gmail.com".
            r"(?![.-][a-zA-Z0-9])",
            0.4,
        ),
    ]

    CONTEXT = [
        "upi",
        "upi id",
        "payment",
        "gpay",
        "phonepe",
        "paytm",
        "bhim",
        "transfer",
        "pay",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "IN_UPI",
    ):
        # An empty list is treated like None: use the class-level defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
56 changes: 56 additions & 0 deletions presidio-analyzer/tests/test_in_upi_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pytest
from tests import assert_result
from presidio_analyzer.predefined_recognizers import InUpiRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured InUpiRecognizer, built once per module."""
    upi_recognizer = InUpiRecognizer()
    return upi_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to ``recognizer.analyze`` in the tests."""
    requested_entities = ["IN_UPI"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_position, expected_score",
    [
        # fmt: off
        # Valid UPI IDs with known handles (High confidence)
        ("shaurya@okicici", 1, (0, 15), 0.7),
        ("9876543210@paytm", 1, (0, 16), 0.7),
        ("john.doe@okhdfcbank", 1, (0, 19), 0.7),
        ("user123@ybl", 1, (0, 11), 0.7),

        # Valid UPI IDs with unknown handles (Medium confidence)
        ("myname@somebank", 1, (0, 15), 0.4),

        # Invalid UPI IDs
        ("notaupiid", 0, (), ()),
        ("@okicici", 0, (), ()),

        # UPI in sentence
        ("Please pay to shaurya@okicici for the order", 1, (14, 29), 0.7),
        # fmt: on
    ],
)
def test_when_upi_in_text_then_all_upis_found(
    text,
    expected_len,
    expected_position,
    expected_score,
    recognizer,
    entities,
):
    """
    Check the number of results and, when any exist, the first result.

    For cases with no expected match, ``expected_position`` and
    ``expected_score`` are empty tuples and are never read.
    """
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len
    if results:
        start, end = expected_position
        assert_result(
            results[0],
            entities[0],
            start,
            end,
            expected_score,
        )