From 6700ce9541c37b2b15e3f796d331dc2c096d1375 Mon Sep 17 00:00:00 2001
From: Mirza-Samad-Ahmed-Baig
Date: Fri, 22 May 2026 13:27:12 +0300
Subject: [PATCH] fix: validate GSTIN checksum

---
Review notes:
- The state-code alternation `(?:0[1-9]|[1-3][0-7])` cannot match 18, 19, 28
  or 29, although `_validate_gstin` accepts 01-37 and the tests in this patch
  expect 29AAGCB7383J1Z4 to be detected. It is now
  `(?:0[1-9]|[12][0-9]|3[0-7])` in GSTIN_PATTERN, in the "GSTIN (High)"
  pattern and in `_sanitize_value`.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch, and the `index` blob hashes were not regenerated.
  Confirm with `git apply --check` before use (TODO for the submitter).

 CHANGELOG.md                                  |   1 +
 .../india/in_gstin_recognizer.py              |  52 ++++---
 .../tests/test_in_gstin_recognizer.py         | 127 ++++++++++--------
 3 files changed, 101 insertions(+), 79 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 619f7d2858..d247cd720d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file.
 - Added `supported_entity` parameter to `PhoneRecognizer`. Previously, this recognizer hard-coded `["PHONE_NUMBER"]` as the only possible supported entity.
 
 #### Fixed
+- Added Luhn mod-36 checksum validation to the Indian GSTIN recognizer, reducing false positives for structurally valid but mistyped GSTINs.
 - Fixed incorrect Prüfziffer algorithm in `DeHealthInsuranceRecognizer` (KVNR); now uses alternating factors [1,2,…,1,2] per § 290 SGB V Anlage 1 (#1972).
 - Fixed incorrect check-digit weights in `DeSocialSecurityRecognizer` (RVNR); now uses VKVV § 4 weights [2,1,2,5,7,1,2,1,2,1,2,1]. Previous weights diverged from the Deutsche Rentenversicherung specification and rejected the canonical DRV example 15070649C103.
 - Fixed incorrect check-digit algorithm in `DeLanrRecognizer`; now uses KBV Arztnummern-Richtlinie weights [4,9,4,9,4,9] without the spurious Quersumme step, and the complement-to-10 formula `(10 − sum mod 10) mod 10`. Previous weights and formula were internally self-consistent only.
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py
index ef965c9322..63f5338af1 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py
@@ -1,3 +1,4 @@
+import re
 from typing import List, Optional, Tuple
 
 from presidio_analyzer import Pattern, PatternRecognizer
@@ -28,10 +29,16 @@ class InGstinRecognizer(PatternRecognizer):
 
     COUNTRY_CODE = "in"
 
+    GSTIN_CHARSET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    GSTIN_PATTERN = re.compile(
+        r"^(?:0[1-9]|[12][0-9]|3[0-7])[A-Z]{5}[0-9]{4}[A-Z][1-9A-Z]Z[0-9A-Z]$"
+    )
+
     PATTERNS = [
         Pattern(
             "GSTIN (High)",
-            r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z0-9]{10}[A-Za-z0-9]{1}Z[A-Za-z0-9]{1})\b",
+            r"\b((?:0[1-9]|[12][0-9]|3[0-7])[A-Za-z]{5}[0-9]{4}"
+            r"[A-Za-z][1-9A-Za-z]Z[A-Za-z0-9])\b",
             0.8,
         ),
         Pattern(
@@ -90,12 +97,10 @@ def validate_result(self, pattern_text: str) -> bool:
 
     def _sanitize_value(self, text: str) -> str:
         """Remove common separators and normalize the text."""
-        import re
-
         # First, try to extract GSTIN pattern from the text
         gstin_pattern = (
-            r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}"
-            r"[0-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b"
+            r"\b((?:0[1-9]|[12][0-9]|3[0-7])[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}"
+            r"[1-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b"
         )
         match = re.search(gstin_pattern, text.upper())
         if match:
@@ -122,26 +127,10 @@ def _validate_gstin(self, gstin: str) -> bool:
         if not state_code.isdigit() or not (1 <= int(state_code) <= 37):
             return False
 
-        # Check PAN format (characters 3-12)
-        pan_part = gstin[2:12]
-        if not self._validate_pan_format(pan_part):
-            return False
-
-        # Check 13th character (registration number)
-        reg_number = gstin[12]
-        if not reg_number.isalnum():
+        if not self.GSTIN_PATTERN.fullmatch(gstin):
             return False
 
-        # Check 14th character should be 'Z'
-        if gstin[13] != "Z":
-            return False
-
-        # Check 15th character (checksum)
-        checksum = gstin[14]
-        if not checksum.isalnum():
-            return False
-
-        return True
+        return self._validate_checksum(gstin)
 
     def _validate_pan_format(self, pan: str) -> bool:
         """
@@ -172,3 +161,20 @@ def _validate_pan_format(self, pan: str) -> bool:
             return False
 
         return True
+
+    def _validate_checksum(self, gstin: str) -> bool:
+        """
+        Validate the GSTIN check digit using the Luhn mod-36 algorithm.
+
+        The first 14 characters are transformed into base-36 values and
+        multiplied by alternating factors of 1 and 2. The GSTIN is valid only
+        when the calculated check character matches the 15th character.
+        """
+        total = 0
+        for index, char in enumerate(gstin[:14]):
+            codepoint = self.GSTIN_CHARSET.index(char)
+            product = codepoint * ((index % 2) + 1)
+            total += (product // 36) + (product % 36)
+
+        check_codepoint = (36 - (total % 36)) % 36
+        return self.GSTIN_CHARSET[check_codepoint] == gstin[14]
diff --git a/presidio-analyzer/tests/test_in_gstin_recognizer.py b/presidio-analyzer/tests/test_in_gstin_recognizer.py
index 5a32a36eee..be2f53ef94 100644
--- a/presidio-analyzer/tests/test_in_gstin_recognizer.py
+++ b/presidio-analyzer/tests/test_in_gstin_recognizer.py
@@ -1,7 +1,9 @@
 import pytest
 
 from tests import assert_result
-from presidio_analyzer.predefined_recognizers.country_specific.india.in_gstin_recognizer import InGstinRecognizer
+from presidio_analyzer.predefined_recognizers.country_specific.india.in_gstin_recognizer import (
+    InGstinRecognizer,
+)
 
 
 @pytest.fixture(scope="module")
@@ -18,33 +20,29 @@ def entities():
     "text, expected_len, expected_position, expected_score",
     [
         # Valid GSTINs with high confidence
-        ("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        ("07PQRST6789K1Z2", 1, (0, 15), 1.0),
-        ("01ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        ("37ABCDE1234F1Z5", 1, (0, 15), 1.0),
-
-        # Valid GSTINs with medium confidence (different PAN format)
-        ("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        ("07PQRST6789K1Z2", 1, (0, 15), 1.0),
-
-        # Valid GSTINs with low confidence (generic pattern)
-        ("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
-
+        ("27AAPFU0939F1ZV", 1, (0, 15), 1.0),
+        ("07AAACR5055K1Z5", 1, (0, 15), 1.0),
+        ("29AAGCB7383J1Z4", 1, (0, 15), 1.0),
+        ("27AASCS2460H1Z0", 1, (0, 15), 1.0),
         # GSTIN with context
-        ("My GSTIN number is 27ABCDE1234F1Z5 for business registration", 1, (19, 34), 1.0),
-        ("GST registration: 07PQRST6789K1Z2", 1, (18, 33), 1.0),
-        ("Tax identification GSTIN: 01ABCDE1234F1Z5", 1, (26, 41), 1.0),
-
+        (
+            "My GSTIN number is 27AAPFU0939F1ZV for business registration",
+            1,
+            (19, 34),
+            1.0,
+        ),
+        ("GST registration: 07AAACR5055K1Z5", 1, (18, 33), 1.0),
+        ("Tax identification GSTIN: 29AAGCB7383J1Z4", 1, (26, 41), 1.0),
         # Multiple GSTINs
-        ("GSTINs: 27ABCDE1234F1Z5 and 07PQRST6789K1Z2", 2, (8, 23), 1.0),
-
+        ("GSTINs: 27AAPFU0939F1ZV and 07AAACR5055K1Z5", 2, (8, 23), 1.0),
         # Invalid GSTINs (should not be detected)
-        ("27ABCDE1234F1Z", 0, (), ()),  # Too short
-        ("27ABCDE1234F1Z55", 0, (), ()),  # Too long
-        ("00ABCDE1234F1Z5", 0, (), ()),  # Invalid state code
-        ("38ABCDE1234F1Z5", 0, (), ()),  # Invalid state code
-        ("27ABCDE1234F1Y5", 0, (), ()),  # Missing 'Z' at position 14
-        ("27ABCDE1234F1Z", 0, (), ()),  # Missing checksum
+        ("27AAPFU0939F1Z", 0, (), ()),  # Too short
+        ("27AAPFU0939F1ZVV", 0, (), ()),  # Too long
+        ("00AAPFU0939F1ZV", 0, (), ()),  # Invalid state code
+        ("38AAPFU0939F1ZV", 0, (), ()),  # Invalid state code
+        ("27AAPFU0939F1YV", 0, (), ()),  # Missing 'Z' at position 14
+        ("27AAPFU0939F1ZU", 0, (), ()),  # Invalid checksum
+        ("27AAPFU0939F0ZV", 0, (), ()),  # Invalid registration character
     ],
 )
 def test_when_gstin_in_text_then_all_gstins_found(
@@ -75,13 +73,15 @@ def test_when_gstin_in_text_then_all_gstins_found(
         ("", 0),
         ("123456789012345", 0),  # All digits
         ("ABCDEFGHIJKLMNO", 0),  # All letters
-        ("27ABCDE1234F1Z", 0),  # Too short
-        ("27ABCDE1234F1Z55", 0),  # Too long
-        ("00ABCDE1234F1Z5", 0),  # Invalid state code (00)
-        ("38ABCDE1234F1Z5", 0),  # Invalid state code (38)
-        ("27ABCDE1234F1Y5", 0),  # Missing 'Z' at position 14
-        ("27ABCDE1234F1Z", 0),  # Missing checksum
-        ("27ABCDE1234F1Z5", 1),  # Valid GSTIN
+        ("27AAPFU0939F1Z", 0),  # Too short
+        ("27AAPFU0939F1ZVV", 0),  # Too long
+        ("00AAPFU0939F1ZV", 0),  # Invalid state code (00)
+        ("38AAPFU0939F1ZV", 0),  # Invalid state code (38)
+        ("27AAPFU0939F1YV", 0),  # Missing 'Z' at position 14
+        ("27AAPFU0939F1Z", 0),  # Missing checksum
+        ("27AAPFU0939F1ZU", 0),  # Wrong checksum
+        ("27AAPFU0939F0ZV", 0),  # Invalid registration character
+        ("27AAPFU0939F1ZV", 1),  # Valid GSTIN
     ],
 )
 def test_gstin_validation(text, expected_len, recognizer, entities):
@@ -94,20 +94,20 @@ def test_gstin_validation(text, expected_len, recognizer, entities):
     "gstin, expected",
     [
         # Valid GSTINs
-        ("27ABCDE1234F1Z5", True),
-        ("07PQRST6789K1Z2", True),
-        ("01ABCDE1234F1Z5", True),
-        ("37ABCDE1234F1Z5", True),
-
+        ("27AAPFU0939F1ZV", True),
+        ("07AAACR5055K1Z5", True),
+        ("29AAGCB7383J1Z4", True),
+        ("27AASCS2460H1Z0", True),
+        ("27aapfu0939f1zv", True),  # Valid with different case
         # Invalid GSTINs
-        ("27ABCDE1234F1Z", False),  # Too short
-        ("27ABCDE1234F1Z55", False),  # Too long
-        ("00ABCDE1234F1Z5", False),  # Invalid state code
-        ("38ABCDE1234F1Z5", False),  # Invalid state code
-        ("27ABCDE1234F1Y5", False),  # Missing 'Z' at position 14
-        ("27ABCDE1234F1Z", False),  # Missing checksum
-        ("27ABCDE1234F1Z5", True),  # Valid
-        ("27ABCDE1234F1Z5", True),  # Valid with different case
+        ("27AAPFU0939F1Z", False),  # Too short
+        ("27AAPFU0939F1ZVV", False),  # Too long
+        ("00AAPFU0939F1ZV", False),  # Invalid state code
+        ("38AAPFU0939F1ZV", False),  # Invalid state code
+        ("27AAPFU0939F1YV", False),  # Missing 'Z' at position 14
+        ("27AAPFU0939F1Z", False),  # Missing checksum
+        ("27AAPFU0939F1ZU", False),  # Wrong checksum
+        ("27AAPFU0939F0ZV", False),  # Invalid registration character
     ],
 )
 def test_validate_result(gstin, expected, recognizer):
@@ -141,11 +141,14 @@ def test_validate_pan_format(pan, expected, recognizer):
 @pytest.mark.parametrize(
     "text, expected",
     [
-        ("27ABCDE1234F1Z5", "27ABCDE1234F1Z5"),
-        ("27ABCDE1234F1Z5", "27ABCDE1234F1Z5"),
-        ("27-ABCDE-1234-F1-Z5", "27ABCDE1234F1Z5"),
-        ("27 ABCDE 1234 F1 Z5", "27ABCDE1234F1Z5"),
-        ("The company GSTIN is 27ABCDE1234F1Z5 for tax purposes", "27ABCDE1234F1Z5"),
+        ("27AAPFU0939F1ZV", "27AAPFU0939F1ZV"),
+        ("27aapfu0939f1zv", "27AAPFU0939F1ZV"),
+        ("27-AAPFU-0939-F1-ZV", "27AAPFU0939F1ZV"),
+        ("27 AAPFU 0939 F1 ZV", "27AAPFU0939F1ZV"),
+        (
+            "The company GSTIN is 27AAPFU0939F1ZV for tax purposes",
+            "27AAPFU0939F1ZV",
+        ),
     ],
 )
 def test_sanitize_value(text, expected, recognizer):
@@ -170,9 +173,7 @@ def test_sanitize_value(text, expected, recognizer):
 def test_gstin_recognizer_with_custom_params():
     """Test GSTIN recognizer initialization with custom parameters."""
     custom_context = ["custom", "context"]
     recognizer = InGstinRecognizer(
-        context=custom_context,
-        supported_language="hi",
-        supported_entity="CUSTOM_GSTIN"
+        context=custom_context, supported_language="hi", supported_entity="CUSTOM_GSTIN"
     )
     assert recognizer.supported_entity == "CUSTOM_GSTIN"
@@ -186,7 +187,21 @@ def test_gstin_recognizer_replacement_pairs():
     recognizer = InGstinRecognizer(replacement_pairs=custom_replacement_pairs)
 
     assert recognizer.replacement_pairs == custom_replacement_pairs
-    
+
     # Test sanitization with custom replacement pairs
-    result = recognizer._sanitize_value("27-ABCDE-1234-F1-Z5")
-    assert result == "27ABCDE1234F1Z5"
+    result = recognizer._sanitize_value("27-AAPFU-0939-F1-ZV")
+    assert result == "27AAPFU0939F1ZV"
+
+
+@pytest.mark.parametrize(
+    "gstin, expected",
+    [
+        ("27AAPFU0939F1ZV", True),
+        ("27AAPFU0939F1ZU", False),
+        ("07AAACR5055K1Z5", True),
+        ("07AAACR5055K1Z4", False),
+    ],
+)
+def test_validate_checksum(gstin, expected, recognizer):
+    """Test GSTIN Luhn mod-36 checksum validation."""
+    assert recognizer._validate_checksum(gstin) == expected