From 6b561cc0ee607109669d3f93e2c532f1f16a56a1 Mon Sep 17 00:00:00 2001
From: shauryaraghav
Date: Sun, 24 May 2026 11:37:37 +0530
Subject: [PATCH 1/2] Add Indian UPI ID recognizer for NPCI payment compliance

---
Review notes (free text between "---" and the first "diff --git" is not
part of the applied change):

- The line structure and indentation of this series were restored; the copy
  under review had its line breaks collapsed and could not be applied.
- india/in_upi_recognizer.py: both regexes now end with (?!\.[a-zA-Z]) so the
  local@domain prefix of an e-mail address (e.g. "john@gmail" inside
  "john@gmail.com") is no longer reported as IN_UPI. New test cases cover it.
- tests/test_in_upi_recognizer.py: the debugging print(results) was dropped.
- india/__init__.py: the blank line after the module docstring and the
  trailing newline are kept; only the import and the __all__ entry are added.
- The new files under presidio-analyzer/ now end with a newline.
- "code in_upi_recognizer.py" is intentionally left exactly as committed,
  because patch 2/2 deletes it and its "-" lines have to match.
- The post-image blob ids on the "index" lines of the three edited files are
  stale after these hand edits; re-commit to regenerate them.

 code in_upi_recognizer.py                     | 65 ++++++++++++++++++
 .../predefined_recognizers/__init__.py        |  2 +
 .../country_specific/india/__init__.py        |  2 +
 .../india/in_upi_recognizer.py                | 68 +++++++++++++++++++
 .../tests/test_in_upi_recognizer.py           | 59 +++++++++++++++++
 5 files changed, 196 insertions(+)
 create mode 100644 code in_upi_recognizer.py
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_upi_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_in_upi_recognizer.py

diff --git a/code in_upi_recognizer.py b/code in_upi_recognizer.py
new file mode 100644
index 0000000000..d8831b373a
--- /dev/null
+++ b/code in_upi_recognizer.py
@@ -0,0 +1,65 @@
+from typing import List, Optional, Tuple
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class InUpiRecognizer(PatternRecognizer):
+    """
+    Recognizes Indian UPI (Unified Payments Interface) IDs.
+
+    UPI IDs are used for digital payments in India and follow the format:
+    username@bankhandle (e.g., shaurya@okicici, 9876543210@paytm)
+
+    Common UPI handles include: okicici, okhdfcbank, okaxis, paytm,
+    ybl, upi, apl, ibl, axl, timecosmos, waicici, wahdfcbank
+
+    This recognizer identifies UPI IDs using regex and context words.
+    Reference: https://www.npci.org.in/what-we-do/upi/product-overview
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    COUNTRY_CODE = "in"
+
+    PATTERNS = [
+        Pattern(
+            "UPI ID (High)",
+            r"\b([a-zA-Z0-9.\-_]{2,256}@(okicici|okhdfcbank|okaxis|oksbi|paytm|ybl|upi|apl|ibl|axl|waicici|wahdfcbank|timecosmos|rapl|mbk|ikwik|freecharge))\b",
+            0.7,
+        ),
+        Pattern(
+            "UPI ID (Medium)",
+            r"\b([a-zA-Z0-9.\-_]{2,256}@[a-zA-Z]{2,64})\b",
+            0.4,
+        ),
+    ]
+
+    CONTEXT = [
+        "upi",
+        "upi id",
+        "payment",
+        "gpay",
+        "phonepe",
+        "paytm",
+        "bhim",
+        "transfer",
+        "pay",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "IN_UPI",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
\ No newline at end of file
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
index 09643bcc45..c0ec6dafe9 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -50,6 +50,7 @@
 from .country_specific.india.in_pan_recognizer import InPanRecognizer
 from .country_specific.india.in_passport_recognizer import InPassportRecognizer
 from .country_specific.india.in_voter_recognizer import InVoterRecognizer
+from .country_specific.india.in_upi_recognizer import InUpiRecognizer
 
 # Italy recognizers
 from .country_specific.italy.it_driver_license_recognizer import (
@@ -221,6 +222,7 @@
     "SgUenRecognizer",
     "InVoterRecognizer",
     "InPassportRecognizer",
+    "InUpiRecognizer",
     "FiPersonalIdentityCodeRecognizer",
     "EsNieRecognizer",
     "EsPassportRecognizer",
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py
index 7cec47f7e4..1f2f6d60f8 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/__init__.py
@@ -6,6 +6,7 @@
 from .in_passport_recognizer import InPassportRecognizer
 from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer
 from .in_voter_recognizer import InVoterRecognizer
+from .in_upi_recognizer import InUpiRecognizer
 
 __all__ = [
     "InAadhaarRecognizer",
@@ -14,4 +15,5 @@
     "InVoterRecognizer",
     "InVehicleRegistrationRecognizer",
     "InPassportRecognizer",
+    "InUpiRecognizer",
 ]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_upi_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_upi_recognizer.py
new file mode 100644
index 0000000000..604468f5c2
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_upi_recognizer.py
@@ -0,0 +1,68 @@
+from typing import List, Optional
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class InUpiRecognizer(PatternRecognizer):
+    """
+    Recognizes Indian UPI (Unified Payments Interface) IDs.
+
+    UPI IDs are used for digital payments in India and follow the format:
+    username@bankhandle (e.g., shaurya@okicici, 9876543210@paytm)
+
+    Common UPI handles include: okicici, okhdfcbank, okaxis, paytm,
+    ybl, upi, apl, ibl, axl, timecosmos, waicici, wahdfcbank
+
+    This recognizer identifies UPI IDs using regex and context words.
+    Reference: https://www.npci.org.in/what-we-do/upi/product-overview
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    COUNTRY_CODE = "in"
+
+    # The handle is matched as letters only, so a hit that is immediately
+    # followed by ".<letter>" is the local@domain prefix of an e-mail address
+    # (e.g. "john@gmail" in "john@gmail.com"); `(?!\.[a-zA-Z])` rejects it.
+    PATTERNS = [
+        Pattern(
+            "UPI ID (High)",
+            r"\b([a-zA-Z0-9.\-_]{2,256}@(okicici|okhdfcbank|okaxis|oksbi|paytm|ybl|upi|apl|ibl|axl|waicici|wahdfcbank|timecosmos|rapl|mbk|ikwik|freecharge))\b(?!\.[a-zA-Z])",
+            0.7,
+        ),
+        Pattern(
+            "UPI ID (Medium)",
+            r"\b([a-zA-Z0-9.\-_]{2,256}@[a-zA-Z]{2,64})\b(?!\.[a-zA-Z])",
+            0.4,
+        ),
+    ]
+
+    CONTEXT = [
+        "upi",
+        "upi id",
+        "payment",
+        "gpay",
+        "phonepe",
+        "paytm",
+        "bhim",
+        "transfer",
+        "pay",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "IN_UPI",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
diff --git a/presidio-analyzer/tests/test_in_upi_recognizer.py b/presidio-analyzer/tests/test_in_upi_recognizer.py
new file mode 100644
index 0000000000..a910efec8a
--- /dev/null
+++ b/presidio-analyzer/tests/test_in_upi_recognizer.py
@@ -0,0 +1,59 @@
+import pytest
+from tests import assert_result
+from presidio_analyzer.predefined_recognizers import InUpiRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return InUpiRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["IN_UPI"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_position, expected_score",
+    [
+        # fmt: off
+        # Valid UPI IDs with known handles (High confidence)
+        ("shaurya@okicici", 1, (0, 15), 0.7),
+        ("9876543210@paytm", 1, (0, 16), 0.7),
+        ("john.doe@okhdfcbank", 1, (0, 19), 0.7),
+        ("user123@ybl", 1, (0, 11), 0.7),
+
+        # Valid UPI IDs with unknown handles (Medium confidence)
+        ("myname@somebank", 1, (0, 15), 0.4),
+
+        # Invalid UPI IDs
+        ("notaupiid", 0, (), ()),
+        ("@okicici", 0, (), ()),
+
+        # E-mail addresses are not UPI IDs
+        ("john.doe@gmail.com", 0, (), ()),
+        ("billing@paytm.com", 0, (), ()),
+
+        # UPI in sentence
+        ("Please pay to shaurya@okicici for the order", 1, (14, 29), 0.7),
+        # fmt: on
+    ],
+)
+def test_when_upi_in_text_then_all_upis_found(
+    text,
+    expected_len,
+    expected_position,
+    expected_score,
+    recognizer,
+    entities,
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    if results:
+        assert_result(
+            results[0],
+            entities[0],
+            expected_position[0],
+            expected_position[1],
+            expected_score,
+        )

From e55bc51e1710d04a41e7743ccff96398a8e8f736 Mon Sep 17 00:00:00 2001
From: shauryaraghav
Date: Sun, 24 May 2026 11:48:33 +0530
Subject: [PATCH 2/2] Remove accidentally committed file

---
 code in_upi_recognizer.py | 65 ---------------------------------------
 1 file changed, 65 deletions(-)
 delete mode 100644 code in_upi_recognizer.py

diff --git a/code in_upi_recognizer.py b/code in_upi_recognizer.py
deleted file mode 100644
index d8831b373a..0000000000
--- a/code in_upi_recognizer.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from typing import List, Optional, Tuple
-from presidio_analyzer import Pattern, PatternRecognizer
-
-
-class InUpiRecognizer(PatternRecognizer):
-    """
-    Recognizes Indian UPI (Unified Payments Interface) IDs.
-
-    UPI IDs are used for digital payments in India and follow the format:
-    username@bankhandle (e.g., shaurya@okicici, 9876543210@paytm)
-
-    Common UPI handles include: okicici, okhdfcbank, okaxis, paytm,
-    ybl, upi, apl, ibl, axl, timecosmos, waicici, wahdfcbank
-
-    This recognizer identifies UPI IDs using regex and context words.
-    Reference: https://www.npci.org.in/what-we-do/upi/product-overview
-
-    :param patterns: List of patterns to be used by this recognizer
-    :param context: List of context words to increase confidence in detection
-    :param supported_language: Language this recognizer supports
-    :param supported_entity: The entity this recognizer can detect
-    """
-
-    COUNTRY_CODE = "in"
-
-    PATTERNS = [
-        Pattern(
-            "UPI ID (High)",
-            r"\b([a-zA-Z0-9.\-_]{2,256}@(okicici|okhdfcbank|okaxis|oksbi|paytm|ybl|upi|apl|ibl|axl|waicici|wahdfcbank|timecosmos|rapl|mbk|ikwik|freecharge))\b",
-            0.7,
-        ),
-        Pattern(
-            "UPI ID (Medium)",
-            r"\b([a-zA-Z0-9.\-_]{2,256}@[a-zA-Z]{2,64})\b",
-            0.4,
-        ),
-    ]
-
-    CONTEXT = [
-        "upi",
-        "upi id",
-        "payment",
-        "gpay",
-        "phonepe",
-        "paytm",
-        "bhim",
-        "transfer",
-        "pay",
-    ]
-
-    def __init__(
-        self,
-        patterns: Optional[List[Pattern]] = None,
-        context: Optional[List[str]] = None,
-        supported_language: str = "en",
-        supported_entity: str = "IN_UPI",
-    ):
-        patterns = patterns if patterns else self.PATTERNS
-        context = context if context else self.CONTEXT
-        super().__init__(
-            supported_entity=supported_entity,
-            patterns=patterns,
-            context=context,
-            supported_language=supported_language,
-        )
\ No newline at end of file