Python/data_structures/pearson_correlation.py at 28e43e7c9313d591d9e7c888c215a6ea2b56d4ec · saurav856/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Pearson Correlation Coefficient: Measures the linear relationship between two
variables. The result is a value between -1 and 1, where:
    1  = perfect positive correlation
    0  = no correlation
   -1  = perfect negative correlation

It is widely used in data analysis, statistics, and machine learning to
understand relationships between features in a dataset.

Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
"""


def pearson_correlation(x: list[float], y: list[float]) -> float:
    """
    Calculate the Pearson Correlation Coefficient between two lists.

    Parameters
    ----------
    x: list[float], first list of numbers
    y: list[float], second list of numbers

    Returns
    -------
    float: Pearson correlation coefficient between -1 and 1

    >>> pearson_correlation([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
    1.0
    >>> pearson_correlation([1, 2, 3, 4, 5], [5, 4, 3, 2, 1])
    -1.0
    >>> pearson_correlation([1, 2, 3], [4, 5, 6])
    1.0
    >>> round(pearson_correlation([1, 2, 3, 4], [1, 2, 1, 2]), 4)
    0.4472
    >>> pearson_correlation([], [1, 2, 3])
    Traceback (most recent call last):
        ...
    ValueError: lists must not be empty
    >>> pearson_correlation([1, 2, 3], [1, 2])
    Traceback (most recent call last):
        ...
    ValueError: lists must have the same length
    >>> pearson_correlation([1, 1, 1], [2, 2, 2])
    Traceback (most recent call last):
        ...
    ValueError: standard deviation of x or y is zero
    """
    if not x or not y:
        raise ValueError("lists must not be empty")
    if len(x) != len(y):
        raise ValueError("lists must have the same length")

    n = len(x)
    mean_x = sum(x) / n
    mean_y = sum(y) / n

    numerator = sum((x[i] - mean_x) * (y[i] - mean_y) for i in range(n))
    std_x = sum((xi - mean_x) ** 2 for xi in x) ** 0.5
    std_y = sum((yi - mean_y) ** 2 for yi in y) ** 0.5

    if std_x == 0 or std_y == 0:
        raise ValueError("standard deviation of x or y is zero")

    return round(numerator / (std_x * std_y), 10)


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    x = [1, 2, 3, 4, 5]
    y = [2, 4, 5, 4, 5]
    print(f"x: {x}")
    print(f"y: {y}")
    print(f"Pearson correlation: {pearson_correlation(x, y)}")