diff --git a/pyreadstat/_readstat_parser.pyx b/pyreadstat/_readstat_parser.pyx index 00be028..12dd1e3 100644 --- a/pyreadstat/_readstat_parser.pyx +++ b/pyreadstat/_readstat_parser.pyx @@ -29,6 +29,7 @@ import datetime import os import warnings import sys +import re import narwhals.stable.v2 as nw import numpy as np @@ -42,27 +43,27 @@ import_datetime() cdef object unix_origin = datetime_new(1970, 1, 1, 0, 0, 0, 0, None) -cdef list sas_date_formats = ["WEEKDATE", "MMDDYY", "DDMMYY", "YYMMDD", "DATE", "DATE9", "YYMMDD10", - "DDMMYYB", "DDMMYYB10", "DDMMYYC", "DDMMYYC10", "DDMMYYD", "DDMMYYD10", - "DDMMYYN6", "DDMMYYN8", "DDMMYYP", "DDMMYYP10", "DDMMYYS", "DDMMYYS10", - "MMDDYYB", "MMDDYYB10", "MMDDYYC", "MMDDYYC10", "MMDDYYD", "MMDDYYD10", - "MMDDYYN6", "MMDDYYN8", "MMDDYYP", "MMDDYYP10", "MMDDYYS", "MMDDYYS10", +cdef object format_regex = re.compile(r"^([A-Z][A-Z0-9]+[A-Z])(\d+)?(?(2)(?:\.\d+)?$|$)") +cdef list sas_date_formats = ["WEEKDATE", "MMDDYY", "DDMMYY", "YYMMDD", "DATE", "YYMMDD", + "DDMMYYB", "DDMMYYC", "DDMMYYD", + "DDMMYYN", "DDMMYYP", "DDMMYYS", + "MMDDYYB", "MMDDYYC", "MMDDYYD", + "MMDDYYN", "MMDDYYP", "MMDDYYS", #"MONNAME", "MONTH", "WEEKDAY", "QTR", "QTRR", "YEAR","DAY", "DOWNAME" # these do not print as full dates in sas "WEEKDATX", "DTDATE", "IS8601DA", "E8601DA", "B8601DA", - "YYMMDDB", "YYMMDDD", "YYMMDDN", "YYMMDDP", "YYMMDDS",] -cdef list sas_datetime_formats = ["DATETIME", "DATETIME18", "DATETIME19", "DATETIME20", "DATETIME21", "DATETIME22", - "E8601DT", "DATEAMPM", "MDYAMPM", "IS8601DT", "B8601DT", "B8601DN"] -cdef list sas_time_formats = ["TIME", "HHMM", "TIME20.3", "TIME20", "TIME5", "TOD", "TIMEAMPM", "IS8601TM", "E8601TM", "B8601TM", ] + "YYMMDDB", "YYMMDDD", "YYMMDDN", "YYMMDDP", "YYMMDDS"] +cdef list sas_datetime_formats = ["DATETIME", "E8601DT", "DATEAMPM", "MDYAMPM", "IS8601DT", "B8601DT", "B8601DN"] +cdef list sas_time_formats = ["TIME", "HHMM", "TOD", "TIMEAMPM", "IS8601TM", "E8601TM", "B8601TM"] # "HOUR" # these do not print as full time formats in sas #cdef list sas_all_formats = sas_date_formats + sas_datetime_formats + sas_time_formats cdef list sas_all_formats cdef object sas_origin = datetime_new(1960, 1, 1, 0, 0, 0, 0, None) cdef object sas_secs_from_unix = total_seconds(unix_origin - sas_origin) -cdef list spss_datetime_formats = ["DATETIME", "DATETIME8", 'DATETIME17', 'DATETIME20', 'DATETIME23.2',"YMDHMS16","YMDHMS19","YMDHMS19.2", "YMDHMS20"] -cdef list spss_date_formats = ["DATE",'DATE8','DATE11', 'DATE12', "ADATE","ADATE8", "ADATE10", "EDATE", 'EDATE8','EDATE10', "JDATE", "JDATE5", "JDATE7", "SDATE", "SDATE8", "SDATE10",] -cdef list spss_time_formats = ["TIME", "DTIME", 'TIME8', 'TIME5', 'TIME11.2'] +cdef list spss_datetime_formats = ["DATETIME", "YMDHMS"] +cdef list spss_date_formats = ["DATE", "ADATE", "EDATE", "JDATE", "SDATE"] +cdef list spss_time_formats = ["TIME", "DTIME"] #cdef list spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats cdef list spss_all_formats cdef object spss_origin = datetime_new(1582, 10, 14, 0, 0, 0, 0, None) @@ -148,26 +149,32 @@ cdef py_datetime_format transform_variable_format(str var_format, py_file_format Transforms a readstat var_format to a date, datetime or time format label """ if file_format == FILE_FORMAT_SAS: - if var_format in sas_all_formats: - if var_format in sas_date_formats: - return DATE_FORMAT_DATE - elif var_format in sas_datetime_formats: - return DATE_FORMAT_DATETIME - elif var_format in sas_time_formats: - return DATE_FORMAT_TIME - else: - return DATE_FORMAT_NOTADATE + if var_format: + format_match = format_regex.match(var_format) + if format_match: + var_format_name = format_match.group(1) + if var_format_name in sas_all_formats: + if var_format_name in sas_date_formats: + return DATE_FORMAT_DATE + elif var_format_name in sas_datetime_formats: + return DATE_FORMAT_DATETIME + elif var_format_name in sas_time_formats: + return DATE_FORMAT_TIME + return DATE_FORMAT_NOTADATE elif file_format == FILE_FORMAT_SPSS: - if var_format in spss_all_formats: - if var_format in spss_date_formats: - return DATE_FORMAT_DATE - elif var_format in spss_datetime_formats: - return DATE_FORMAT_DATETIME - elif var_format in spss_time_formats: - return DATE_FORMAT_TIME - else: - return DATE_FORMAT_NOTADATE + if var_format: + format_match = format_regex.match(var_format) + if format_match: + var_format_name = format_match.group(1) + if var_format_name in spss_all_formats: + if var_format_name in spss_date_formats: + return DATE_FORMAT_DATE + elif var_format_name in spss_datetime_formats: + return DATE_FORMAT_DATETIME + elif var_format_name in spss_time_formats: + return DATE_FORMAT_TIME + return DATE_FORMAT_NOTADATE elif file_format == FILE_FORMAT_STATA: if var_format in stata_all_formats: @@ -1267,27 +1274,27 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_ if extra_date_formats is not None: if file_format == FILE_FORMAT_SAS: - sas_date_formats.extend(extra_date_formats) + sas_date_formats.extend([format_regex.match(edf).group(1) for edf in extra_date_formats if format_regex.match(edf)]) elif file_format == FILE_FORMAT_SPSS: - spss_date_formats.extend(extra_date_formats) + spss_date_formats.extend([format_regex.match(edf).group(1) for edf in extra_date_formats if format_regex.match(edf)]) elif file_format == FILE_FORMAT_STATA: stata_date_formats.extend(extra_date_formats) else: raise PyreadstatError("Unknown file format") if extra_datetime_formats is not None: if file_format == FILE_FORMAT_SAS: - sas_datetime_formats.extend(extra_datetime_formats) + sas_datetime_formats.extend([format_regex.match(edtf).group(1) for edtf in extra_datetime_formats if format_regex.match(edtf)]) elif file_format == FILE_FORMAT_SPSS: - spss_datetime_formats.extend(extra_datetime_formats) + spss_datetime_formats.extend([format_regex.match(edtf).group(1) for edtf in extra_datetime_formats if format_regex.match(edtf)]) elif file_format == FILE_FORMAT_STATA: stata_datetime_formats.extend(extra_datetime_formats) else: raise PyreadstatError("Unknown file format") if extra_time_formats is not None: if file_format == FILE_FORMAT_SAS: - sas_time_formats.extend(extra_time_formats) + sas_time_formats.extend([format_regex.match(etf).group(1) for etf in extra_time_formats if format_regex.match(etf)]) elif file_format == FILE_FORMAT_SPSS: - spss_time_formats.extend(extra_time_formats) + spss_time_formats.extend([format_regex.match(etf).group(1) for etf in extra_time_formats if format_regex.match(etf)]) elif file_format == FILE_FORMAT_STATA: stata_time_formats.extend(extra_time_formats) else: