Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 43 additions & 36 deletions pyreadstat/_readstat_parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import datetime
import os
import warnings
import sys
import re

import narwhals.stable.v2 as nw
import numpy as np
Expand All @@ -42,27 +43,27 @@ import_datetime()

cdef object unix_origin = datetime_new(1970, 1, 1, 0, 0, 0, 0, None)

cdef list sas_date_formats = ["WEEKDATE", "MMDDYY", "DDMMYY", "YYMMDD", "DATE", "DATE9", "YYMMDD10",
"DDMMYYB", "DDMMYYB10", "DDMMYYC", "DDMMYYC10", "DDMMYYD", "DDMMYYD10",
"DDMMYYN6", "DDMMYYN8", "DDMMYYP", "DDMMYYP10", "DDMMYYS", "DDMMYYS10",
"MMDDYYB", "MMDDYYB10", "MMDDYYC", "MMDDYYC10", "MMDDYYD", "MMDDYYD10",
"MMDDYYN6", "MMDDYYN8", "MMDDYYP", "MMDDYYP10", "MMDDYYS", "MMDDYYS10",
# Splits a format string into a base name plus an optional width/decimals suffix.
#   group 1: the base name - one uppercase letter, then one or more uppercase
#            letters or digits, then a final uppercase letter (so at least three
#            characters, e.g. "DATE" in "DATE9", or "E8601DA").
#   group 2: optional width digits directly after the name (e.g. "9" in "DATE9").
# The conditional (?(2)...) permits a ".<digits>" decimals part only when a width
# was captured (e.g. "TIME20.3"); the pattern is anchored with ^ and $ either way.
# Because the name must end in a letter, trailing digits always land in group 2.
# NOTE(review): no IGNORECASE flag is set, so lowercase names do not match, and
# neither do names shorter than three characters - confirm this is intended.
cdef object format_regex = re.compile(r"^([A-Z][A-Z0-9]+[A-Z])(\d+)?(?(2)(?:\.\d+)?$|$)")
cdef list sas_date_formats = ["WEEKDATE", "MMDDYY", "DDMMYY", "YYMMDD", "DATE", "YYMMDD",
"DDMMYYB", "DDMMYYC", "DDMMYYD",
"DDMMYYN", "DDMMYYP", "DDMMYYS",
"MMDDYYB", "MMDDYYC", "MMDDYYD",
"MMDDYYN", "MMDDYYP", "MMDDYYS",
#"MONNAME", "MONTH", "WEEKDAY", "QTR", "QTRR", "YEAR","DAY", "DOWNAME" # these do not print as full dates in sas
"WEEKDATX", "DTDATE",
"IS8601DA", "E8601DA", "B8601DA",
"YYMMDDB", "YYMMDDD", "YYMMDDN", "YYMMDDP", "YYMMDDS",]
cdef list sas_datetime_formats = ["DATETIME", "DATETIME18", "DATETIME19", "DATETIME20", "DATETIME21", "DATETIME22",
"E8601DT", "DATEAMPM", "MDYAMPM", "IS8601DT", "B8601DT", "B8601DN"]
cdef list sas_time_formats = ["TIME", "HHMM", "TIME20.3", "TIME20", "TIME5", "TOD", "TIMEAMPM", "IS8601TM", "E8601TM", "B8601TM", ]
"YYMMDDB", "YYMMDDD", "YYMMDDN", "YYMMDDP", "YYMMDDS"]
# Base names (without any width/decimals suffix) of SAS formats classified as
# datetimes. transform_variable_format looks up the name captured by
# format_regex (group 1) in this list, so a single entry such as "DATETIME"
# covers every width variant of that format.
cdef list sas_datetime_formats = ["DATETIME", "E8601DT", "DATEAMPM", "MDYAMPM", "IS8601DT", "B8601DT", "B8601DN"]
# Base names of SAS formats classified as times; looked up the same way.
cdef list sas_time_formats = ["TIME", "HHMM", "TOD", "TIMEAMPM", "IS8601TM", "E8601TM", "B8601TM"]
# "HOUR" # these do not print as full time formats in sas
#cdef list sas_all_formats = sas_date_formats + sas_datetime_formats + sas_time_formats
cdef list sas_all_formats
cdef object sas_origin = datetime_new(1960, 1, 1, 0, 0, 0, 0, None)
cdef object sas_secs_from_unix = total_seconds(unix_origin - sas_origin)

cdef list spss_datetime_formats = ["DATETIME", "DATETIME8", 'DATETIME17', 'DATETIME20', 'DATETIME23.2',"YMDHMS16","YMDHMS19","YMDHMS19.2", "YMDHMS20"]
cdef list spss_date_formats = ["DATE",'DATE8','DATE11', 'DATE12', "ADATE","ADATE8", "ADATE10", "EDATE", 'EDATE8','EDATE10', "JDATE", "JDATE5", "JDATE7", "SDATE", "SDATE8", "SDATE10",]
cdef list spss_time_formats = ["TIME", "DTIME", 'TIME8', 'TIME5', 'TIME11.2']
# Base names (without any width/decimals suffix) of SPSS formats, grouped by the
# label transform_variable_format returns for them. The lookup uses the name
# captured by format_regex (group 1), so one entry covers all width variants.
# Formats classified as datetimes:
cdef list spss_datetime_formats = ["DATETIME", "YMDHMS"]
# Formats classified as dates:
cdef list spss_date_formats = ["DATE", "ADATE", "EDATE", "JDATE", "SDATE"]
# Formats classified as times:
cdef list spss_time_formats = ["TIME", "DTIME"]
#cdef list spss_all_formats = spss_date_formats + spss_datetime_formats + spss_time_formats
cdef list spss_all_formats
cdef object spss_origin = datetime_new(1582, 10, 14, 0, 0, 0, 0, None)
Expand Down Expand Up @@ -148,26 +149,32 @@ cdef py_datetime_format transform_variable_format(str var_format, py_file_format
Transforms a readstat var_format to a date, datetime or time format label
"""
if file_format == FILE_FORMAT_SAS:
if var_format in sas_all_formats:
if var_format in sas_date_formats:
return DATE_FORMAT_DATE
elif var_format in sas_datetime_formats:
return DATE_FORMAT_DATETIME
elif var_format in sas_time_formats:
return DATE_FORMAT_TIME
else:
return DATE_FORMAT_NOTADATE
if var_format:
format_match = format_regex.match(var_format)
if format_match:
var_format_name = format_match.group(1)
if var_format_name in sas_all_formats:
if var_format_name in sas_date_formats:
return DATE_FORMAT_DATE
elif var_format_name in sas_datetime_formats:
return DATE_FORMAT_DATETIME
elif var_format_name in sas_time_formats:
return DATE_FORMAT_TIME
return DATE_FORMAT_NOTADATE

elif file_format == FILE_FORMAT_SPSS:
if var_format in spss_all_formats:
if var_format in spss_date_formats:
return DATE_FORMAT_DATE
elif var_format in spss_datetime_formats:
return DATE_FORMAT_DATETIME
elif var_format in spss_time_formats:
return DATE_FORMAT_TIME
else:
return DATE_FORMAT_NOTADATE
if var_format:
format_match = format_regex.match(var_format)
if format_match:
var_format_name = format_match.group(1)
if var_format_name in spss_all_formats:
if var_format_name in spss_date_formats:
return DATE_FORMAT_DATE
elif var_format_name in spss_datetime_formats:
return DATE_FORMAT_DATETIME
elif var_format_name in spss_time_formats:
return DATE_FORMAT_TIME
return DATE_FORMAT_NOTADATE

elif file_format == FILE_FORMAT_STATA:
if var_format in stata_all_formats:
Expand Down Expand Up @@ -1267,27 +1274,27 @@ cdef object run_conversion(object filename_path, py_file_format file_format, py_

if extra_date_formats is not None:
if file_format == FILE_FORMAT_SAS:
sas_date_formats.extend(extra_date_formats)
sas_date_formats.extend([format_regex.match(edf).group(1) for edf in extra_date_formats if format_regex.match(edf)])
elif file_format == FILE_FORMAT_SPSS:
spss_date_formats.extend(extra_date_formats)
spss_date_formats.extend([format_regex.match(edf).group(1) for edf in extra_date_formats if format_regex.match(edf)])
elif file_format == FILE_FORMAT_STATA:
stata_date_formats.extend(extra_date_formats)
else:
raise PyreadstatError("Unknown file format")
if extra_datetime_formats is not None:
if file_format == FILE_FORMAT_SAS:
sas_datetime_formats.extend(extra_datetime_formats)
sas_datetime_formats.extend([format_regex.match(edtf).group(1) for edtf in extra_datetime_formats if format_regex.match(edtf)])
elif file_format == FILE_FORMAT_SPSS:
spss_datetime_formats.extend(extra_datetime_formats)
spss_datetime_formats.extend([format_regex.match(edtf).group(1) for edtf in extra_datetime_formats if format_regex.match(edtf)])
elif file_format == FILE_FORMAT_STATA:
stata_datetime_formats.extend(extra_datetime_formats)
else:
raise PyreadstatError("Unknown file format")
if extra_time_formats is not None:
if file_format == FILE_FORMAT_SAS:
sas_time_formats.extend(extra_time_formats)
sas_time_formats.extend([format_regex.match(etf).group(1) for etf in extra_time_formats if format_regex.match(etf)])
elif file_format == FILE_FORMAT_SPSS:
spss_time_formats.extend(extra_time_formats)
spss_time_formats.extend([format_regex.match(etf).group(1) for etf in extra_time_formats if format_regex.match(etf)])
elif file_format == FILE_FORMAT_STATA:
stata_time_formats.extend(extra_time_formats)
else:
Expand Down
Loading