From 6700ce9541c37b2b15e3f796d331dc2c096d1375 Mon Sep 17 00:00:00 2001
From: Mirza-Samad-Ahmed-Baig <Mirzasamadahmedbaig@gmail.com>
Date: Fri, 22 May 2026 13:27:12 +0300
Subject: [PATCH] fix: validate GSTIN checksum

---
 CHANGELOG.md                                  |   1 +
 .../india/in_gstin_recognizer.py              |  50 ++++---
 .../tests/test_in_gstin_recognizer.py         | 127 ++++++++++--------
 3 files changed, 100 insertions(+), 78 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 619f7d2858..d247cd720d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file.
 - Added `supported_entity` parameter to `PhoneRecognizer`. Previously, this recognizer hard-coded `["PHONE_NUMBER"]` as the only possible supported entity.
 
 #### Fixed
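+- Fixed the Indian GSTIN recognizer accepting GSTINs with an incorrect check character: it now validates the Luhn mod-36 checksum and the PAN/registration-character layout, reducing false positives for structurally valid but mistyped GSTINs.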
 - Fixed incorrect Prüfziffer algorithm in `DeHealthInsuranceRecognizer` (KVNR); now uses alternating factors [1,2,…,1,2] per § 290 SGB V Anlage 1 (#1972).
 - Fixed incorrect check-digit weights in `DeSocialSecurityRecognizer` (RVNR); now uses VKVV § 4 weights [2,1,2,5,7,1,2,1,2,1,2,1]. Previous weights diverged from the Deutsche Rentenversicherung specification and rejected the canonical DRV example 15070649C103.
 - Fixed incorrect check-digit algorithm in `DeLanrRecognizer`; now uses KBV Arztnummern-Richtlinie weights [4,9,4,9,4,9] without the spurious Quersumme step, and the complement-to-10 formula `(10 − sum mod 10) mod 10`. Previous weights and formula were internally self-consistent only.
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py
index ef965c9322..63f5338af1 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py
@@ -1,3 +1,4 @@
+import re
 from typing import List, Optional, Tuple
 
 from presidio_analyzer import Pattern, PatternRecognizer
@@ -28,10 +29,16 @@ class InGstinRecognizer(PatternRecognizer):
 
     COUNTRY_CODE = "in"
 
+    GSTIN_CHARSET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"  # index = base-36 value
+    GSTIN_PATTERN = re.compile(
+        r"^(?:0[1-9]|[12][0-9]|3[0-7])[A-Z]{5}[0-9]{4}[A-Z][1-9A-Z]Z[0-9A-Z]$"
+    )  # state code 01-37, PAN, registration character, literal Z, check character
+
     PATTERNS = [
         Pattern(
             "GSTIN (High)",
-            r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z0-9]{10}[A-Za-z0-9]{1}Z[A-Za-z0-9]{1})\b",
+            r"\b((?:0[1-9]|[12][0-9]|3[0-7])[A-Za-z]{5}[0-9]{4}"
+            r"[A-Za-z][1-9A-Za-z]Z[A-Za-z0-9])\b",
             0.8,
         ),
         Pattern(
@@ -90,12 +97,10 @@ def validate_result(self, pattern_text: str) -> bool:
 
     def _sanitize_value(self, text: str) -> str:
         """Remove common separators and normalize the text."""
-        import re
-
         # First, try to extract GSTIN pattern from the text
         gstin_pattern = (
             r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}"
-            r"[0-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b"
+            r"[1-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b"
         )
         match = re.search(gstin_pattern, text.upper())
         if match:
@@ -122,26 +127,10 @@ def _validate_gstin(self, gstin: str) -> bool:
         if not state_code.isdigit() or not (1 <= int(state_code) <= 37):
             return False
 
-        # Check PAN format (characters 3-12)
-        pan_part = gstin[2:12]
-        if not self._validate_pan_format(pan_part):
-            return False
-
-        # Check 13th character (registration number)
-        reg_number = gstin[12]
-        if not reg_number.isalnum():
+        if not self.GSTIN_PATTERN.fullmatch(gstin):
             return False
 
-        # Check 14th character should be 'Z'
-        if gstin[13] != "Z":
-            return False
-
-        # Check 15th character (checksum)
-        checksum = gstin[14]
-        if not checksum.isalnum():
-            return False
-
-        return True
+        return self._validate_checksum(gstin)
 
     def _validate_pan_format(self, pan: str) -> bool:
         """
@@ -172,3 +161,20 @@ def _validate_pan_format(self, pan: str) -> bool:
             return False
 
         return True
+
+    def _validate_checksum(self, gstin: str) -> bool:
+        """
+        Validate the GSTIN check character using the Luhn mod-36 algorithm.
+
+        Each of the first 14 characters is mapped to its base-36 value, weighted
+        1, 2, 1, 2, ..., and folded to quotient + remainder of 36. The 15th
+        character must equal the mod-36 complement of the sum. Input that is
+        not exactly 15 characters from 0-9A-Z returns False instead of raising.
+        """
+        charset = self.GSTIN_CHARSET
+        if len(gstin) != 15 or any(char not in charset for char in gstin):
+            return False
+        total = 0
+        for index, char in enumerate(gstin[:14]):
+            total += sum(divmod(charset.index(char) * ((index % 2) + 1), 36))
+        return charset[(36 - (total % 36)) % 36] == gstin[14]
diff --git a/presidio-analyzer/tests/test_in_gstin_recognizer.py b/presidio-analyzer/tests/test_in_gstin_recognizer.py
index 5a32a36eee..be2f53ef94 100644
--- a/presidio-analyzer/tests/test_in_gstin_recognizer.py
+++ b/presidio-analyzer/tests/test_in_gstin_recognizer.py
@@ -1,7 +1,9 @@
 import pytest
 
 from tests import assert_result
-from presidio_analyzer.predefined_recognizers.country_specific.india.in_gstin_recognizer import InGstinRecognizer
+from presidio_analyzer.predefined_recognizers.country_specific.india.in_gstin_recognizer import (
+    InGstinRecognizer,
+)
 
 
 @pytest.fixture(scope="module")
@@ -18,33 +20,29 @@ def entities():
     "text, expected_len, expected_position, expected_score",
     [
         # Valid GSTINs with high confidence
-        ("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        ("07PQRST6789K1Z2", 1, (0, 15), 1.0),
-        ("01ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        ("37ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        
-        # Valid GSTINs with medium confidence (different PAN format)
-        ("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        ("07PQRST6789K1Z2", 1, (0, 15), 1.0),
-        
-        # Valid GSTINs with low confidence (generic pattern)
-        ("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
-        
+        ("27AAPFU0939F1ZV", 1, (0, 15), 1.0),
+        ("07AAACR5055K1Z9", 1, (0, 15), 1.0),
+        ("29AAGCB7383J1Z4", 1, (0, 15), 1.0),
+        ("27AASCS2460H1Z0", 1, (0, 15), 1.0),
         # GSTIN with context
-        ("My GSTIN number is 27ABCDE1234F1Z5 for business registration", 1, (19, 34), 1.0),
-        ("GST registration: 07PQRST6789K1Z2", 1, (18, 33), 1.0),
-        ("Tax identification GSTIN: 01ABCDE1234F1Z5", 1, (26, 41), 1.0),
-        
+        (
+            "My GSTIN number is 27AAPFU0939F1ZV for business registration",
+            1,
+            (19, 34),
+            1.0,
+        ),
+        ("GST registration: 07AAACR5055K1Z9", 1, (18, 33), 1.0),
+        ("Tax identification GSTIN: 29AAGCB7383J1Z4", 1, (26, 41), 1.0),
         # Multiple GSTINs
-        ("GSTINs: 27ABCDE1234F1Z5 and 07PQRST6789K1Z2", 2, (8, 23), 1.0),
-        
+        ("GSTINs: 27AAPFU0939F1ZV and 07AAACR5055K1Z9", 2, (8, 23), 1.0),
         # Invalid GSTINs (should not be detected)
-        ("27ABCDE1234F1Z", 0, (), ()),  # Too short
-        ("27ABCDE1234F1Z55", 0, (), ()),  # Too long
-        ("00ABCDE1234F1Z5", 0, (), ()),  # Invalid state code
-        ("38ABCDE1234F1Z5", 0, (), ()),  # Invalid state code
-        ("27ABCDE1234F1Y5", 0, (), ()),  # Missing 'Z' at position 14
-        ("27ABCDE1234F1Z", 0, (), ()),  # Missing checksum
+        ("27AAPFU0939F1Z", 0, (), ()),  # Too short
+        ("27AAPFU0939F1ZVV", 0, (), ()),  # Too long
+        ("00AAPFU0939F1ZV", 0, (), ()),  # Invalid state code
+        ("38AAPFU0939F1ZV", 0, (), ()),  # Invalid state code
+        ("27AAPFU0939F1YV", 0, (), ()),  # Missing 'Z' at position 14
+        ("27AAPFU0939F1ZU", 0, (), ()),  # Invalid checksum
+        ("27AAPFU0939F0ZV", 0, (), ()),  # Invalid registration character
     ],
 )
 def test_when_gstin_in_text_then_all_gstins_found(
@@ -75,13 +73,15 @@ def test_when_gstin_in_text_then_all_gstins_found(
         ("", 0),
         ("123456789012345", 0),  # All digits
         ("ABCDEFGHIJKLMNO", 0),  # All letters
-        ("27ABCDE1234F1Z", 0),   # Too short
-        ("27ABCDE1234F1Z55", 0), # Too long
-        ("00ABCDE1234F1Z5", 0),  # Invalid state code (00)
-        ("38ABCDE1234F1Z5", 0),  # Invalid state code (38)
-        ("27ABCDE1234F1Y5", 0),  # Missing 'Z' at position 14
-        ("27ABCDE1234F1Z", 0),   # Missing checksum
-        ("27ABCDE1234F1Z5", 1),  # Valid GSTIN
+        ("27AAPFU0939F1Z", 0),  # Too short
+        ("27AAPFU0939F1ZVV", 0),  # Too long
+        ("00AAPFU0939F1ZV", 0),  # Invalid state code (00)
+        ("38AAPFU0939F1ZV", 0),  # Invalid state code (38)
+        ("27AAPFU0939F1YV", 0),  # Missing 'Z' at position 14
+        ("27AAPFU0939F1Z", 0),  # Missing checksum
+        ("27AAPFU0939F1ZU", 0),  # Wrong checksum
+        ("27AAPFU0939F0ZV", 0),  # Invalid registration character
+        ("27AAPFU0939F1ZV", 1),  # Valid GSTIN
     ],
 )
 def test_gstin_validation(text, expected_len, recognizer, entities):
@@ -94,20 +94,20 @@ def test_gstin_validation(text, expected_len, recognizer, entities):
     "gstin, expected",
     [
         # Valid GSTINs
-        ("27ABCDE1234F1Z5", True),
-        ("07PQRST6789K1Z2", True),
-        ("01ABCDE1234F1Z5", True),
-        ("37ABCDE1234F1Z5", True),
-        
+        ("27AAPFU0939F1ZV", True),
+        ("07AAACR5055K1Z9", True),
+        ("29AAGCB7383J1Z4", True),
+        ("27AASCS2460H1Z0", True),
+        ("27aapfu0939f1zv", True),  # Valid with different case
         # Invalid GSTINs
-        ("27ABCDE1234F1Z", False),   # Too short
-        ("27ABCDE1234F1Z55", False), # Too long
-        ("00ABCDE1234F1Z5", False),  # Invalid state code
-        ("38ABCDE1234F1Z5", False),  # Invalid state code
-        ("27ABCDE1234F1Y5", False),  # Missing 'Z' at position 14
-        ("27ABCDE1234F1Z", False),   # Missing checksum
-        ("27ABCDE1234F1Z5", True),   # Valid
-        ("27ABCDE1234F1Z5", True),   # Valid with different case
+        ("27AAPFU0939F1Z", False),  # Too short
+        ("27AAPFU0939F1ZVV", False),  # Too long
+        ("00AAPFU0939F1ZV", False),  # Invalid state code
+        ("38AAPFU0939F1ZV", False),  # Invalid state code
+        ("27AAPFU0939F1YV", False),  # Missing 'Z' at position 14
+        ("27AAPFU0939F1Z", False),  # Missing checksum
+        ("27AAPFU0939F1ZU", False),  # Wrong checksum
+        ("27AAPFU0939F0ZV", False),  # Invalid registration character
     ],
 )
 def test_validate_result(gstin, expected, recognizer):
@@ -141,11 +141,14 @@ def test_validate_pan_format(pan, expected, recognizer):
 @pytest.mark.parametrize(
     "text, expected",
     [
-        ("27ABCDE1234F1Z5", "27ABCDE1234F1Z5"),
-        ("27ABCDE1234F1Z5", "27ABCDE1234F1Z5"),
-        ("27-ABCDE-1234-F1-Z5", "27ABCDE1234F1Z5"),
-        ("27 ABCDE 1234 F1 Z5", "27ABCDE1234F1Z5"),
-        ("The company GSTIN is 27ABCDE1234F1Z5 for tax purposes", "27ABCDE1234F1Z5"),
+        ("27AAPFU0939F1ZV", "27AAPFU0939F1ZV"),
+        ("27aapfu0939f1zv", "27AAPFU0939F1ZV"),
+        ("27-AAPFU-0939-F1-ZV", "27AAPFU0939F1ZV"),
+        ("27 AAPFU 0939 F1 ZV", "27AAPFU0939F1ZV"),
+        (
+            "The company GSTIN is 27AAPFU0939F1ZV for tax purposes",
+            "27AAPFU0939F1ZV",
+        ),
     ],
 )
 def test_sanitize_value(text, expected, recognizer):
@@ -170,9 +173,7 @@ def test_gstin_recognizer_with_custom_params():
     """Test GSTIN recognizer initialization with custom parameters."""
     custom_context = ["custom", "context"]
     recognizer = InGstinRecognizer(
-        context=custom_context,
-        supported_language="hi",
-        supported_entity="CUSTOM_GSTIN"
+        context=custom_context, supported_language="hi", supported_entity="CUSTOM_GSTIN"
     )
     
     assert recognizer.supported_entity == "CUSTOM_GSTIN"
@@ -186,7 +187,21 @@ def test_gstin_recognizer_replacement_pairs():
     recognizer = InGstinRecognizer(replacement_pairs=custom_replacement_pairs)
     
     assert recognizer.replacement_pairs == custom_replacement_pairs
-    
+
     # Test sanitization with custom replacement pairs
-    result = recognizer._sanitize_value("27-ABCDE-1234-F1-Z5")
-    assert result == "27ABCDE1234F1Z5"
+    result = recognizer._sanitize_value("27-AAPFU-0939-F1-ZV")
+    assert result == "27AAPFU0939F1ZV"
+
+
+@pytest.mark.parametrize(
+    "gstin, expected",
+    [
+        ("27AAPFU0939F1ZV", True),  # computed check character is V
+        ("27AAPFU0939F1ZU", False),  # U != V
+        ("07AAACR5055K1Z9", True),  # computed check character is 9
+        ("07AAACR5055K1Z4", False),  # 4 != 9
+    ],
+)
+def test_validate_checksum(gstin, expected, recognizer):
+    """Check that only the correct Luhn mod-36 check character is accepted."""
+    assert recognizer._validate_checksum(gstin) == expected