From 5ad7dc732768645f87f5defcaf68112f3cdf5521 Mon Sep 17 00:00:00 2001
From: RAJVEER42 <irajveer.bishnoi2310@gmail.com>
Date: Sat, 23 May 2026 16:28:51 +0530
Subject: [PATCH] fix(safaa): handle non-list iterables in
 _ensure_list_of_strings

_ensure_list_of_strings called data.to_list() when the input was not a
list. to_list is a pandas Series method, so tuples, generators, sets and
other iterables raised AttributeError despite the docstring stating the
parameter accepts an "iterable".

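Replace with list(data), which works uniformly for tuples, generators,
sets, pandas Series, NumPy arrays, and any other iterable (lists are
passed through unchanged). Note that a bare str is itself iterable and
would be split into characters, so a single string must still be passed
wrapped in a list.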

Also introduce a pytest-based test suite (the repo previously had none),
add pytest to the dev dependencies, and cover the regression path along
with baseline tests for predict() and declutter() so future refactors
have a safety net.

Signed-off-by: RAJVEER42 <irajveer.bishnoi2310@gmail.com>
---
 Safaa/src/safaa/Safaa.py |   4 +-
 pyproject.toml           |   1 +
 tests/__init__.py        |   3 +
 tests/test_safaa.py      | 170 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_safaa.py

diff --git a/Safaa/src/safaa/Safaa.py b/Safaa/src/safaa/Safaa.py
index cfa4cd5..aa3e143 100644
--- a/Safaa/src/safaa/Safaa.py
+++ b/Safaa/src/safaa/Safaa.py
@@ -142,9 +142,9 @@ def _ensure_list_of_strings(self, data):
         list: A list of strings.
         """
 
-        # If data is not a list, try converting it to a list
+        # If data is not a list, convert it to a list
         if not isinstance(data, list):
-            data = data.to_list()
+            data = list(data)
         # Ensure each item in the list is a string
         return [str(item) for item in data]
 
diff --git a/pyproject.toml b/pyproject.toml
index 830a55d..8e29e89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ psycopg2-binary = '>=2.9'
 requests = '>=2.28'
 flake8 = '*'
 build = '*'
+pytest = '*'
 
 [project]
 name = 'safaa'
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..d31f995
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: © 2026 RAJVEER42 <irajveer.bishnoi2310@gmail.com>
+#
+# SPDX-License-Identifier: LGPL-2.1-only
diff --git a/tests/test_safaa.py b/tests/test_safaa.py
new file mode 100644
index 0000000..67d6030
--- /dev/null
+++ b/tests/test_safaa.py
@@ -0,0 +1,170 @@
+# SPDX-FileCopyrightText: © 2026 RAJVEER42 <irajveer.bishnoi2310@gmail.com>
+#
+# SPDX-License-Identifier: LGPL-2.1-only
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from safaa.Safaa import SafaaAgent
+
+
+@pytest.fixture(scope="module")
+def agent():
+    return SafaaAgent()
+
+
+# ---------------------------------------------------------------------------
+# _ensure_list_of_strings
+# ---------------------------------------------------------------------------
+
+class TestEnsureListOfStrings:
+
+    def test_list_of_strings_unchanged(self, agent):
+        data = ["Copyright 2024 Foo Inc.", "src/lib/tests"]
+        result = agent._ensure_list_of_strings(data)
+        assert result == ["Copyright 2024 Foo Inc.", "src/lib/tests"]
+
+    def test_tuple_converted_to_list(self, agent):
+        # tuples are a common Python iterable — must not raise AttributeError
+        result = agent._ensure_list_of_strings(("Copyright 2024 Foo", "bar"))
+        assert result == ["Copyright 2024 Foo", "bar"]
+
+    def test_generator_consumed_to_list(self, agent):
+        # generators have no .to_list(); this was broken before the fix
+        gen = (s for s in ["Copyright 2024 A", "Copyright 2024 B"])
+        result = agent._ensure_list_of_strings(gen)
+        assert result == ["Copyright 2024 A", "Copyright 2024 B"]
+
+    def test_pandas_series_converted(self, agent):
+        series = pd.Series(["Copyright 2024 Foo", "noise string"])
+        result = agent._ensure_list_of_strings(series)
+        assert result == ["Copyright 2024 Foo", "noise string"]
+
+    def test_numpy_array_converted(self, agent):
+        arr = np.array(["Copyright 2024 Foo", "noise"])
+        result = agent._ensure_list_of_strings(arr)
+        assert result == ["Copyright 2024 Foo", "noise"]
+
+    def test_list_with_integers_coerced_to_strings(self, agent):
+        result = agent._ensure_list_of_strings([2024, 42])
+        assert result == ["2024", "42"]
+
+    def test_list_with_none_coerced_to_string(self, agent):
+        result = agent._ensure_list_of_strings([None, "Copyright 2024 Foo"])
+        assert result == ["None", "Copyright 2024 Foo"]
+
+    def test_list_with_mixed_types(self, agent):
+        result = agent._ensure_list_of_strings([1, None, "hello", 3.14])
+        assert result == ["1", "None", "hello", "3.14"]
+
+    def test_empty_list(self, agent):
+        assert agent._ensure_list_of_strings([]) == []
+
+    def test_empty_tuple(self, agent):
+        assert agent._ensure_list_of_strings(()) == []
+
+    def test_empty_generator(self, agent):
+        assert agent._ensure_list_of_strings(x for x in []) == []
+
+    def test_single_element_list(self, agent):
+        assert agent._ensure_list_of_strings(["only one"]) == ["only one"]
+
+    def test_single_element_tuple(self, agent):
+        assert agent._ensure_list_of_strings(("only one",)) == ["only one"]
+
+    def test_output_is_always_list(self, agent):
+        # Regardless of input type, output must always be a plain list
+        for input_data in [("a",), pd.Series(["a"]), np.array(["a"])]:
+            result = agent._ensure_list_of_strings(input_data)
+            assert type(result) is list
+
+    def test_whitespace_strings_preserved(self, agent):
+        result = agent._ensure_list_of_strings(["  ", "\t"])
+        assert result == ["  ", "\t"]
+
+
+# ---------------------------------------------------------------------------
+# predict — baseline smoke tests (broader coverage left to follow-up PRs)
+# ---------------------------------------------------------------------------
+
+class TestPredict:
+
+    TRUE_POSITIVES = [
+        "Copyright 2024 Siemens AG",
+        "Copyright (C) 2019 Red Hat, Inc.",
+        "Copyright 2020 Google LLC",
+    ]
+    FALSE_POSITIVES = [
+        "src/lib/c/tests/testlibs",
+    ]
+
+    def test_known_true_copyright_predicts_t(self, agent):
+        for sample in self.TRUE_POSITIVES:
+            result = agent.predict([sample])
+            assert result == ["t"], f"Expected 't' for {sample!r}, got {result}"
+
+    def test_known_false_positive_predicts_f(self, agent):
+        for sample in self.FALSE_POSITIVES:
+            result = agent.predict([sample])
+            assert result == ["f"], f"Expected 'f' for {sample!r}, got {result}"
+
+    def test_output_length_matches_input(self, agent):
+        data = self.TRUE_POSITIVES + self.FALSE_POSITIVES
+        result = agent.predict(data)
+        assert len(result) == len(data)
+
+    def test_output_values_only_t_or_f(self, agent):
+        data = self.TRUE_POSITIVES + self.FALSE_POSITIVES
+        result = agent.predict(data)
+        assert all(v in ("t", "f") for v in result)
+
+    def test_accepts_tuple_input(self, agent):
+        # Enabled by the _ensure_list_of_strings fix in this PR
+        result = agent.predict(tuple(self.TRUE_POSITIVES))
+        assert len(result) == len(self.TRUE_POSITIVES)
+
+    def test_accepts_generator_input(self, agent):
+        # Enabled by the _ensure_list_of_strings fix in this PR
+        result = agent.predict(s for s in self.TRUE_POSITIVES)
+        assert len(result) == len(self.TRUE_POSITIVES)
+
+    def test_accepts_pandas_series_input(self, agent):
+        series = pd.Series(self.TRUE_POSITIVES + self.FALSE_POSITIVES)
+        result = agent.predict(series)
+        assert len(result) == len(series)
+
+
+# ---------------------------------------------------------------------------
+# declutter — baseline smoke tests
+# ---------------------------------------------------------------------------
+
+class TestDeclutter:
+
+    def test_false_positive_becomes_empty_string(self, agent):
+        result = agent.declutter(["src/lib/c/tests/testlibs"], ["f"])
+        assert result == [""]
+
+    def test_true_positive_returns_nonempty(self, agent):
+        result = agent.declutter(["Copyright 2024 Siemens AG"], ["t"])
+        assert result[0] != ""
+
+    def test_output_length_matches_input(self, agent):
+        data = ["Copyright 2024 Foo", "noise", "Copyright 2020 Bar"]
+        preds = ["t", "f", "t"]
+        result = agent.declutter(data, preds)
+        assert len(result) == len(data)
+
+    def test_all_false_positives_all_empty(self, agent):
+        data = ["noise one", "noise two", "noise three"]
+        preds = ["f", "f", "f"]
+        result = agent.declutter(data, preds)
+        assert result == ["", "", ""]
+
+    def test_empty_input(self, agent):
+        assert agent.declutter([], []) == []
+
+    def test_prediction_f_always_wins(self, agent):
+        # Even a genuine copyright string should be blanked if predicted false
+        result = agent.declutter(["Copyright 2024 Siemens AG"], ["f"])
+        assert result == [""]