diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 09643bcc4..c0ec6dafe 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -50,6 +50,7 @@ from .country_specific.india.in_pan_recognizer import InPanRecognizer from .country_specific.india.in_passport_recognizer import InPassportRecognizer from .country_specific.india.in_voter_recognizer import InVoterRecognizer +from .country_specific.india.in_upi_recognizer import InUpiRecognizer # Italy recognizers from .country_specific.italy.it_driver_license_recognizer import ( @@ -221,6 +222,7 @@ "SgUenRecognizer", "InVoterRecognizer", "InPassportRecognizer", + "InUpiRecognizer", "FiPersonalIdentityCodeRecognizer", "EsNieRecognizer", "EsPassportRecognizer", diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py index 7cec47f7e..1f2f6d60f 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py @@ -1,11 +1,11 @@ """India-specific recognizers.""" - from .in_aadhaar_recognizer import InAadhaarRecognizer from .in_gstin_recognizer import InGstinRecognizer from .in_pan_recognizer import InPanRecognizer from .in_passport_recognizer import InPassportRecognizer from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer from .in_voter_recognizer import InVoterRecognizer +from .in_upi_recognizer import InUpiRecognizer __all__ = [ "InAadhaarRecognizer", @@ -14,4 +14,5 @@ "InVoterRecognizer", "InVehicleRegistrationRecognizer", "InPassportRecognizer", -] + "InUpiRecognizer", +] \ No newline at end of file 
# File: presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_upi_recognizer.py
"""Pattern-based recognizer for Indian UPI (Unified Payments Interface) IDs."""

from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class InUpiRecognizer(PatternRecognizer):
    """
    Recognizes Indian UPI (Unified Payments Interface) IDs.

    UPI IDs are used for digital payments in India and follow the format:
    username@bankhandle (e.g., shaurya@okicici, 9876543210@paytm)

    Common UPI handles include: okicici, okhdfcbank, okaxis, paytm,
    ybl, upi, apl, ibl, axl, timecosmos, waicici, wahdfcbank

    Two patterns are used: a higher-scored one restricted to the known
    handles listed in this class, and a lower-scored one accepting any
    alphabetic handle. Both refuse a candidate whose handle continues as a
    dotted or hyphenated domain, so that e-mail addresses such as
    ``john@gmail.com`` are not reported as UPI IDs.

    Reference: https://www.npci.org.in/what-we-do/upi/product-overview

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    COUNTRY_CODE = "in"

    # Username part of the ID (the text before "@").
    _USERNAME = r"[a-zA-Z0-9.\-_]{2,256}"

    # Handles that earn the higher score when they follow the "@".
    _KNOWN_HANDLES = (
        "okicici",
        "okhdfcbank",
        "okaxis",
        "oksbi",
        "paytm",
        "ybl",
        "upi",
        "apl",
        "ibl",
        "axl",
        "waicici",
        "wahdfcbank",
        "timecosmos",
        "rapl",
        "mbk",
        "ikwik",
        "freecharge",
    )

    # Zero-width guard placed after the handle: a "." or "-" immediately
    # followed by an alphanumeric means the handle is really the first label
    # of a domain name (an e-mail address), so the candidate is rejected.
    # A sentence-ending "." followed by whitespace or end of text still passes,
    # and the matched span is unaffected because a lookahead consumes nothing.
    _NOT_A_DOMAIN = r"(?![.\-][a-zA-Z0-9])"

    PATTERNS = [
        Pattern(
            "UPI ID (High)",
            r"\b("
            + _USERNAME
            + "@("
            + "|".join(_KNOWN_HANDLES)
            + r"))\b"
            + _NOT_A_DOMAIN,
            0.7,
        ),
        Pattern(
            "UPI ID (Medium)",
            r"\b(" + _USERNAME + r"@[a-zA-Z]{2,64})\b" + _NOT_A_DOMAIN,
            0.4,
        ),
    ]

    CONTEXT = [
        "upi",
        "upi id",
        "payment",
        "gpay",
        "phonepe",
        "paytm",
        "bhim",
        "transfer",
        "pay",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "IN_UPI",
    ):
        # Fall back to the class-level defaults when the caller passes
        # nothing (or an empty list) for patterns / context.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )


# File: presidio-analyzer/tests/test_in_upi_recognizer.py
import pytest
from tests import assert_result
from presidio_analyzer.predefined_recognizers import InUpiRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide one InUpiRecognizer built with its default arguments."""
    return InUpiRecognizer()


@pytest.fixture(scope="module")
def entities():
    """Provide the entity list passed to ``recognizer.analyze``."""
    return ["IN_UPI"]


@pytest.mark.parametrize(
    "text, expected_len, expected_position, expected_score",
    [
        # fmt: off
        # Valid UPI IDs with known handles (High confidence)
        ("shaurya@okicici", 1, (0, 15), 0.7),
        ("9876543210@paytm", 1, (0, 16), 0.7),
        ("john.doe@okhdfcbank", 1, (0, 19), 0.7),
        ("user123@ybl", 1, (0, 11), 0.7),

        # Valid UPI IDs with unknown handles (Medium confidence)
        ("myname@somebank", 1, (0, 15), 0.4),

        # Invalid UPI IDs
        ("notaupiid", 0, (), ()),
        ("@okicici", 0, (), ()),

        # E-mail addresses must not be reported as UPI IDs
        ("john.doe@gmail.com", 0, (), ()),
        ("user@paytm.com", 0, (), ()),
        ("john@my-bank.com", 0, (), ()),

        # UPI in sentence
        ("Please pay to shaurya@okicici for the order", 1, (14, 29), 0.7),

        # A sentence-ending period directly after the ID is not part of it
        ("Send it to shaurya@okicici.", 1, (11, 26), 0.7),
        # fmt: on
    ],
)
def test_when_upi_in_text_then_all_upis_found(
    text,
    expected_len,
    expected_position,
    expected_score,
    recognizer,
    entities,
):
    """Check result count, span and score for each parametrized text."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len
    if results:
        assert_result(
            results[0],
            entities[0],
            expected_position[0],
            expected_position[1],
            expected_score,
        )