Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ All notable changes to this project will be documented in this file.
- Added `supported_entity` parameter to `PhoneRecognizer`. Previously, this recognizer hard-coded `["PHONE_NUMBER"]` as the only possible supported entity.

#### Fixed
- Added Luhn mod-36 checksum validation to the Indian GSTIN recognizer, reducing false positives for structurally valid but mistyped GSTINs.
- Fixed incorrect Prüfziffer algorithm in `DeHealthInsuranceRecognizer` (KVNR); now uses alternating factors [1,2,…,1,2] per § 290 SGB V Anlage 1 (#1972).
- Fixed incorrect check-digit weights in `DeSocialSecurityRecognizer` (RVNR); now uses VKVV § 4 weights [2,1,2,5,7,1,2,1,2,1,2,1]. Previous weights diverged from the Deutsche Rentenversicherung specification and rejected the canonical DRV example 15070649C103.
- Fixed incorrect check-digit algorithm in `DeLanrRecognizer`; now uses KBV Arztnummern-Richtlinie weights [4,9,4,9,4,9] without the spurious Quersumme step, and the complement-to-10 formula `(10 − sum mod 10) mod 10`. Previous weights and formula were internally self-consistent only.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
Expand Down Expand Up @@ -28,10 +29,16 @@ class InGstinRecognizer(PatternRecognizer):

COUNTRY_CODE = "in"

GSTIN_CHARSET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
GSTIN_PATTERN = re.compile(
r"^(?:0[1-9]|[1-3][0-7])[A-Z]{5}[0-9]{4}[A-Z][1-9A-Z]Z[0-9A-Z]$"
)

PATTERNS = [
Pattern(
"GSTIN (High)",
r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z0-9]{10}[A-Za-z0-9]{1}Z[A-Za-z0-9]{1})\b",
r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z]{5}[0-9]{4}"
r"[A-Za-z][1-9A-Za-z]Z[A-Za-z0-9])\b",
0.8,
),
Pattern(
Expand Down Expand Up @@ -90,12 +97,10 @@ def validate_result(self, pattern_text: str) -> bool:

def _sanitize_value(self, text: str) -> str:
"""Remove common separators and normalize the text."""
import re

# First, try to extract GSTIN pattern from the text
gstin_pattern = (
r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}"
r"[0-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b"
r"[1-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b"
)
match = re.search(gstin_pattern, text.upper())
if match:
Expand All @@ -122,26 +127,10 @@ def _validate_gstin(self, gstin: str) -> bool:
if not state_code.isdigit() or not (1 <= int(state_code) <= 37):
return False

# Check PAN format (characters 3-12)
pan_part = gstin[2:12]
if not self._validate_pan_format(pan_part):
return False

# Check 13th character (registration number)
reg_number = gstin[12]
if not reg_number.isalnum():
if not self.GSTIN_PATTERN.fullmatch(gstin):
return False

# Check 14th character should be 'Z'
if gstin[13] != "Z":
return False

# Check 15th character (checksum)
checksum = gstin[14]
if not checksum.isalnum():
return False

return True
return self._validate_checksum(gstin)

def _validate_pan_format(self, pan: str) -> bool:
"""
Expand Down Expand Up @@ -172,3 +161,20 @@ def _validate_pan_format(self, pan: str) -> bool:
return False

return True

def _validate_checksum(self, gstin: str) -> bool:
"""
Validate the GSTIN check digit using the Luhn mod-36 algorithm.

The first 14 characters are transformed into base-36 values and
multiplied by alternating factors of 1 and 2. The GSTIN is valid only
when the calculated check character matches the 15th character.
"""
total = 0
for index, char in enumerate(gstin[:14]):
codepoint = self.GSTIN_CHARSET.index(char)
product = codepoint * ((index % 2) + 1)
total += (product // 36) + (product % 36)

check_codepoint = (36 - (total % 36)) % 36
return self.GSTIN_CHARSET[check_codepoint] == gstin[14]
127 changes: 71 additions & 56 deletions presidio-analyzer/tests/test_in_gstin_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers.country_specific.india.in_gstin_recognizer import InGstinRecognizer
from presidio_analyzer.predefined_recognizers.country_specific.india.in_gstin_recognizer import (
InGstinRecognizer,
)


@pytest.fixture(scope="module")
Expand All @@ -18,33 +20,29 @@ def entities():
"text, expected_len, expected_position, expected_score",
[
# Valid GSTINs with high confidence
("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
("07PQRST6789K1Z2", 1, (0, 15), 1.0),
("01ABCDE1234F1Z5", 1, (0, 15), 1.0),
("37ABCDE1234F1Z5", 1, (0, 15), 1.0),

# Valid GSTINs with medium confidence (different PAN format)
("27ABCDE1234F1Z5", 1, (0, 15), 1.0),
("07PQRST6789K1Z2", 1, (0, 15), 1.0),

# Valid GSTINs with low confidence (generic pattern)
("27ABCDE1234F1Z5", 1, (0, 15), 1.0),

("27AAPFU0939F1ZV", 1, (0, 15), 1.0),
("07AAACR5055K1Z5", 1, (0, 15), 1.0),
("29AAGCB7383J1Z4", 1, (0, 15), 1.0),
("27AASCS2460H1Z0", 1, (0, 15), 1.0),
# GSTIN with context
("My GSTIN number is 27ABCDE1234F1Z5 for business registration", 1, (19, 34), 1.0),
("GST registration: 07PQRST6789K1Z2", 1, (18, 33), 1.0),
("Tax identification GSTIN: 01ABCDE1234F1Z5", 1, (26, 41), 1.0),

(
"My GSTIN number is 27AAPFU0939F1ZV for business registration",
1,
(19, 34),
1.0,
),
("GST registration: 07AAACR5055K1Z5", 1, (18, 33), 1.0),
("Tax identification GSTIN: 29AAGCB7383J1Z4", 1, (26, 41), 1.0),
# Multiple GSTINs
("GSTINs: 27ABCDE1234F1Z5 and 07PQRST6789K1Z2", 2, (8, 23), 1.0),

("GSTINs: 27AAPFU0939F1ZV and 07AAACR5055K1Z5", 2, (8, 23), 1.0),
# Invalid GSTINs (should not be detected)
("27ABCDE1234F1Z", 0, (), ()), # Too short
("27ABCDE1234F1Z55", 0, (), ()), # Too long
("00ABCDE1234F1Z5", 0, (), ()), # Invalid state code
("38ABCDE1234F1Z5", 0, (), ()), # Invalid state code
("27ABCDE1234F1Y5", 0, (), ()), # Missing 'Z' at position 14
("27ABCDE1234F1Z", 0, (), ()), # Missing checksum
("27AAPFU0939F1Z", 0, (), ()), # Too short
("27AAPFU0939F1ZVV", 0, (), ()), # Too long
("00AAPFU0939F1ZV", 0, (), ()), # Invalid state code
("38AAPFU0939F1ZV", 0, (), ()), # Invalid state code
("27AAPFU0939F1YV", 0, (), ()), # Missing 'Z' at position 14
("27AAPFU0939F1ZU", 0, (), ()), # Invalid checksum
("27AAPFU0939F0ZV", 0, (), ()), # Invalid registration character
],
)
def test_when_gstin_in_text_then_all_gstins_found(
Expand Down Expand Up @@ -75,13 +73,15 @@ def test_when_gstin_in_text_then_all_gstins_found(
("", 0),
("123456789012345", 0), # All digits
("ABCDEFGHIJKLMNO", 0), # All letters
("27ABCDE1234F1Z", 0), # Too short
("27ABCDE1234F1Z55", 0), # Too long
("00ABCDE1234F1Z5", 0), # Invalid state code (00)
("38ABCDE1234F1Z5", 0), # Invalid state code (38)
("27ABCDE1234F1Y5", 0), # Missing 'Z' at position 14
("27ABCDE1234F1Z", 0), # Missing checksum
("27ABCDE1234F1Z5", 1), # Valid GSTIN
("27AAPFU0939F1Z", 0), # Too short
("27AAPFU0939F1ZVV", 0), # Too long
("00AAPFU0939F1ZV", 0), # Invalid state code (00)
("38AAPFU0939F1ZV", 0), # Invalid state code (38)
("27AAPFU0939F1YV", 0), # Missing 'Z' at position 14
("27AAPFU0939F1Z", 0), # Missing checksum
("27AAPFU0939F1ZU", 0), # Wrong checksum
("27AAPFU0939F0ZV", 0), # Invalid registration character
("27AAPFU0939F1ZV", 1), # Valid GSTIN
],
)
def test_gstin_validation(text, expected_len, recognizer, entities):
Expand All @@ -94,20 +94,20 @@ def test_gstin_validation(text, expected_len, recognizer, entities):
"gstin, expected",
[
# Valid GSTINs
("27ABCDE1234F1Z5", True),
("07PQRST6789K1Z2", True),
("01ABCDE1234F1Z5", True),
("37ABCDE1234F1Z5", True),

("27AAPFU0939F1ZV", True),
("07AAACR5055K1Z5", True),
("29AAGCB7383J1Z4", True),
("27AASCS2460H1Z0", True),
("27aapfu0939f1zv", True), # Valid with different case
# Invalid GSTINs
("27ABCDE1234F1Z", False), # Too short
("27ABCDE1234F1Z55", False), # Too long
("00ABCDE1234F1Z5", False), # Invalid state code
("38ABCDE1234F1Z5", False), # Invalid state code
("27ABCDE1234F1Y5", False), # Missing 'Z' at position 14
("27ABCDE1234F1Z", False), # Missing checksum
("27ABCDE1234F1Z5", True), # Valid
("27ABCDE1234F1Z5", True), # Valid with different case
("27AAPFU0939F1Z", False), # Too short
("27AAPFU0939F1ZVV", False), # Too long
("00AAPFU0939F1ZV", False), # Invalid state code
("38AAPFU0939F1ZV", False), # Invalid state code
("27AAPFU0939F1YV", False), # Missing 'Z' at position 14
("27AAPFU0939F1Z", False), # Missing checksum
("27AAPFU0939F1ZU", False), # Wrong checksum
("27AAPFU0939F0ZV", False), # Invalid registration character
],
)
def test_validate_result(gstin, expected, recognizer):
Expand Down Expand Up @@ -141,11 +141,14 @@ def test_validate_pan_format(pan, expected, recognizer):
@pytest.mark.parametrize(
"text, expected",
[
("27ABCDE1234F1Z5", "27ABCDE1234F1Z5"),
("27ABCDE1234F1Z5", "27ABCDE1234F1Z5"),
("27-ABCDE-1234-F1-Z5", "27ABCDE1234F1Z5"),
("27 ABCDE 1234 F1 Z5", "27ABCDE1234F1Z5"),
("The company GSTIN is 27ABCDE1234F1Z5 for tax purposes", "27ABCDE1234F1Z5"),
("27AAPFU0939F1ZV", "27AAPFU0939F1ZV"),
("27aapfu0939f1zv", "27AAPFU0939F1ZV"),
("27-AAPFU-0939-F1-ZV", "27AAPFU0939F1ZV"),
("27 AAPFU 0939 F1 ZV", "27AAPFU0939F1ZV"),
(
"The company GSTIN is 27AAPFU0939F1ZV for tax purposes",
"27AAPFU0939F1ZV",
),
],
)
def test_sanitize_value(text, expected, recognizer):
Expand All @@ -170,9 +173,7 @@ def test_gstin_recognizer_with_custom_params():
"""Test GSTIN recognizer initialization with custom parameters."""
custom_context = ["custom", "context"]
recognizer = InGstinRecognizer(
context=custom_context,
supported_language="hi",
supported_entity="CUSTOM_GSTIN"
context=custom_context, supported_language="hi", supported_entity="CUSTOM_GSTIN"
)

assert recognizer.supported_entity == "CUSTOM_GSTIN"
Expand All @@ -186,7 +187,21 @@ def test_gstin_recognizer_replacement_pairs():
recognizer = InGstinRecognizer(replacement_pairs=custom_replacement_pairs)

assert recognizer.replacement_pairs == custom_replacement_pairs

# Test sanitization with custom replacement pairs
result = recognizer._sanitize_value("27-ABCDE-1234-F1-Z5")
assert result == "27ABCDE1234F1Z5"
result = recognizer._sanitize_value("27-AAPFU-0939-F1-ZV")
assert result == "27AAPFU0939F1ZV"


@pytest.mark.parametrize(
    "gstin, expected",
    [
        ("27AAPFU0939F1ZV", True),
        ("27AAPFU0939F1ZU", False),
        # For the prefix 07AAACR5055K1Z the weighted base-36 sum is 171,
        # 171 % 36 == 27, so the check character is charset[36 - 27] == "9".
        ("07AAACR5055K1Z9", True),
        ("07AAACR5055K1Z4", False),
    ],
)
def test_validate_checksum(gstin, expected, recognizer):
    """Check that _validate_checksum accepts only the correct check character."""
    assert recognizer._validate_checksum(gstin) == expected
Loading