From 68184a10e2d53ae84ec20dbb2a8c0bfe81bff437 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 17 Aug 2015 15:54:17 -0400 Subject: [PATCH 001/212] Add comments while walking through code. --- R/PSM_correlogram.R | 135 +++++++++++++++++++++++-------------------- R/PSMdistDecay.R | 40 ++++++------- R/SpatialCausalPSM.R | 100 ++++++++++++++++---------------- R/timeRangeTrend.R | 67 +++++++++++++-------- 4 files changed, 187 insertions(+), 155 deletions(-) diff --git a/R/PSM_correlogram.R b/R/PSM_correlogram.R index 2e44f33..f4f9107 100644 --- a/R/PSM_correlogram.R +++ b/R/PSM_correlogram.R @@ -1,66 +1,77 @@ -#forked from SPDEP (sp.correlogram) +# forked from SPDEP (sp.correlogram) +PSM_correlogram <- function (dta, var, order = 1, style = "W", randomisation = TRUE, zero.policy = NULL, spChk = NULL, start, end) { -PSM_correlogram <- function (dta, var, order = 1, style = "W", - randomisation = TRUE, zero.policy = NULL, spChk = NULL, start,end) -{ - stopifnot(is.vector(var)) - if (any(is.na(var))) - stop("no NAs permitted in variable") - if (is.null(zero.policy)) - zero.policy <- get("zeroPolicy", envir = .spdepOptions) - stopifnot(is.logical(zero.policy)) - if (is.null(spChk)) - spChk <- get.spChkOption() - if (order < 1) - stop("order less than 1") - - #nblags <- nblag(neighbours, maxlag = order) - #r.nb <- dnearneigh(as.matrix(coordinates(dta_prj)),d1=c1,d2=c2) - nblags <- vector(mode = "list", length = order) - end = end * 1000 - start = start * 1000 - rng_increment = (end-start) / order - cur_step = start + rng_increment - cur_start = start - nblags[[1]] <- dnearneigh(dta,d1=cur_start,d2=cur_step) - binname <- list() - binname <- rbind(binname,(cur_start/1000)) - for(L in 2:order) - { - cur_start = cur_step - cur_step = cur_step + rng_increment - nblags[[L]] <- dnearneigh(dta,d1=cur_start,d2=cur_step) - binname <- rbind(binname,(cur_start/1000)) - } - cardnos <- vector(mode = "list", length = order) - for (i in 1:order) cardnos[[i]] <- table(card(nblags[[i]])) - nobs <- sapply(cardnos, function(x) sum(x[names(x) > "0"])) - #if (any(nobs < 3)) - # stop("sp.correlogram: too few included observations in higher lags:\n\treduce order.") - res <- matrix(NA, nrow = 0, ncol = 3) - cnt = 0 + stopifnot(is.vector(var)) + + if (any(is.na(var))) + stop("no NAs permitted in variable") + + if (is.null(zero.policy)) + zero.policy <- get("zeroPolicy", envir = .spdepOptions) + + stopifnot(is.logical(zero.policy)) + + if (is.null(spChk)) + spChk <- get.spChkOption() + + if (order < 1) + stop("order less than 1") + + #nblags <- nblag(neighbours, maxlag = order) + #r.nb <- dnearneigh(as.matrix(coordinates(dta_prj)),d1=c1,d2=c2) + + end = end * 1000 + start = start * 1000 + rng_increment = (end-start) / order + nblags <- vector(mode = "list", length = order) + binname <- list() + + cur_step = start + rng_increment + cur_start = start + nblags[[1]] <- dnearneigh(dta, d1=cur_start, d2=cur_step) + binname <- rbind(binname, (cur_start/1000)) + + for (L in 2:order) { + cur_start = cur_step + cur_step = cur_step + rng_increment + nblags[[L]] <- dnearneigh(dta, d1=cur_start, d2=cur_step) + binname <- rbind(binname, (cur_start/1000)) + } + + cardnos <- vector(mode = "list", length = order) + for (i in 1:order) { + cardnos[[i]] <- table(card(nblags[[i]])) + } + + nobs <- sapply(cardnos, function(x) sum(x[names(x) > "0"])) + + #if (any(nobs < 3)) + # stop("sp.correlogram: too few included observations in higher lags:\n\treduce order.") + res <- matrix(NA, nrow = 0, ncol = 3) + cnt = 0 + for (i in 1:order) { + if (nobs[[i]] == 0) { + cardnos <- cardnos[-i] + cnt = cnt + 1 + warn_str = paste("Bin h=",i,"was empty, and is excluded from the plot.") + warning(warn_str) + } else { + listw <- nb2listw(nblags[[i]], style = style, zero.policy = zero.policy) + res <- rbind(res, moran.test(var, listw, randomisation = randomisation, zero.policy = zero.policy)$estimate) + cur_rw = i - cnt + rownames(res)[cur_rw] <- binname[[i]] + } + } + order = order - cnt + #rownames(res) <- 1:order + + cardnos <- vector(mode = "list", length = order) for (i in 1:order) { - if(nobs[[i]] == 0) - { - cardnos <- cardnos[-i] - cnt = cnt + 1 - warn_str = paste("Bin h=",i,"was empty, and is excluded from the plot.") - warning(warn_str) - } else { - listw <- nb2listw(nblags[[i]], style = style, zero.policy = zero.policy) - res <- rbind(res,moran.test(var, listw, randomisation = randomisation, - zero.policy = zero.policy)$estimate) - cur_rw = i - cnt - rownames(res)[cur_rw] <- binname[[i]] - } + cardnos[[i]] <- table(card(nblags[[i]])) } - order = order - cnt - #rownames(res) <- 1:order - cardnos <- vector(mode = "list", length = order) - for (i in 1:order) cardnos[[i]] <- table(card(nblags[[i]])) - obj <- list(res = res, method = "I", cardnos = cardnos, - var = deparse(substitute(var))) - class(obj) <- "spcor" - print(obj) - obj + + obj <- list(res = res, method = "I", cardnos = cardnos, var = deparse(substitute(var))) + class(obj) <- "spcor" + print(obj) + obj } \ No newline at end of file diff --git a/R/PSMdistDecay.R b/R/PSMdistDecay.R index 20ae298..77ce202 100644 --- a/R/PSMdistDecay.R +++ b/R/PSMdistDecay.R @@ -1,22 +1,22 @@ -#PSM distance decay examination - should we enforce a threshold for matches or not? -#This is a wrapper for a heavily modified sp.correlogram from SPDEP -#This new function (PSM_correlogram) allows for the specification of distance bands -#Neighbors within each band are then tested for Moran's I correlation. -PSMdistDecay = function(dta,psm_col,start,end,h) -{ - #Produce a corellogram using Moran's I at varying resolutions - #First, convert to an equal-distance projection - - #Need to update this so it handles projections correctly every time. - #Currently hacked together. - - dta_prj_coords <- project(as.matrix(coordinates(dta)),"+proj=laea") - dta_prj <- as(dta,"data.frame") - coordinates(dta_prj) <- dta_prj_coords - - exec <- paste("PSM_correlogram(as.matrix(coordinates(dta_prj)),dta_prj$",psm_col,",order=",h,",zero.policy=TRUE,start=",start,",end=",end,")",sep="") - sp.cor <- eval(parse(text=exec)) - plot(sp.cor) - return(sp.cor) +# PSM distance decay examination - should we enforce a threshold for matches or not? +# This is a wrapper for a heavily modified sp.correlogram from SPDEP +# This new function (PSM_correlogram) allows for the specification of distance bands +# Neighbors within each band are then tested for Moran's I correlation. +PSMdistDecay = function(dta, psm_col, start, end, h) { + # Produce a corellogram using Moran's I at varying resolutions + + # First, convert to an equal-distance projection + # Need to update this so it handles projections correctly every time. + # Currently hacked together. + dta_prj_coords <- project(as.matrix(coordinates(dta)),"+proj=laea") + dta_prj <- as(dta,"data.frame") + coordinates(dta_prj) <- dta_prj_coords + + # run PSM_correlogram and return results + exec <- paste("PSM_correlogram(as.matrix(coordinates(dta_prj)),dta_prj$",psm_col,",order=",h,",zero.policy=TRUE,start=",start,",end=",end,")", sep="") + sp.cor <- eval(parse(text=exec)) + plot(sp.cor) + + return(sp.cor) } diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 6af506a..5284989 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -1,56 +1,56 @@ -SpatialCausalPSM <- function(dta, mtd,mdl,drop, visual) -{ - #Initialization - pltObjs <- list() - - #Method - if(mtd == "logit") - { - PSMfit <- glm(mdl, dta@data, family="binomial") - retData <- dta - retData$PSM_trtProb <- predict(PSMfit, dta@data, type="response") - } - if(mtd=="lm") - { - PSMfit <- lm(mdl, dta@data) +SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) {} + # Initialization + pltObjs <- list() + + # generate model based on method + if (mtd == "logit") { + # generalized linear model + PSMfit <- glm(mdl, dta@data, family="binomial") + else if (mtd == "lm") { + # linear model + PSMfit <- lm(mdl, dta@data) + } + + # copy data retData <- dta - retData$PSM_trtProb <- predict(PSMfit, dta@data, type="response") - } - - if(visual == "TRUE") - { - #Show user distributions. - pltObjs[[1]] <- GroupCompHist(retData, "PSM_trtProb","Initial PSM Balance",simple_out=FALSE) - print(summary(PSMfit)) - } - - #Second, if a drop parameter - if set to "support", remove observations - #that don't overlap in the PSM distribution. - if(drop == "support") - { + # predict values based on model + retData$PSM_trtProb <- predict(PSMfit, dta@data, type="response") - #Drop - treated <- retData@data[retData@data$TrtBin == 1,] - untreated <- retData@data[retData@data$TrtBin == 0,] - min_cut <- max(min(treated$PSM_trtProb), min(untreated$PSM_trtProb)) - max_cut <- min(max(treated$PSM_trtProb), max(untreated$PSM_trtProb)) - retData <- retData[retData@data$PSM_trtProb >= min_cut,] - retData <- retData[retData@data$PSM_trtProb <= max_cut,] + if (visual == "TRUE") { + # Show user distributions. + pltObjs[[1]] <- GroupCompHist(retData, "PSM_trtProb", "Initial PSM Balance", simple_out=FALSE) + print(summary(PSMfit)) + } - } - if(visual == "TRUE") - { - #Post drop histograms - pltObjs[[2]] <- GroupCompHist(retData, "PSM_trtProb","Post-Extrapolation Drops (if enabled)",simple_out=FALSE) - - #Output graphics - grid.arrange(pltObjs[[1]], pltObjs[[2]],ncol=2,main="PSM Matching Stage 1 (Dropping Observations Requiring Extrapolation)") - - } - retEle <- 0 - retEle$data <- retData - retEle$model <- PSMfit - return (retEle) + # Second, if a drop parameter - if set to "support", remove observations + # that don't overlap in the PSM distribution. + if (drop == "support") { + + # Drop + treated <- retData@data[retData@data$TrtBin == 1,] + untreated <- retData@data[retData@data$TrtBin == 0,] + min_cut <- max(min(treated$PSM_trtProb), min(untreated$PSM_trtProb)) + max_cut <- min(max(treated$PSM_trtProb), max(untreated$PSM_trtProb)) + + retData <- retData[retData@data$PSM_trtProb >= min_cut,] + retData <- retData[retData@data$PSM_trtProb <= max_cut,] + + } + + if (visual == "TRUE") { + # Post drop histograms + pltObjs[[2]] <- GroupCompHist(retData, "PSM_trtProb", "Post-Extrapolation Drops (if enabled)", simple_out=FALSE) + + # Output graphics + grid.arrange(pltObjs[[1]], pltObjs[[2]], ncol=2, main="PSM Matching Stage 1 (Dropping Observations Requiring Extrapolation)") + + } + + # return original and predicted data along with model + retEle <- 0 + retEle$data <- retData + retEle$model <- PSMfit + return (retEle) } diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index 33c6e87..85f4afa 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -1,23 +1,44 @@ -timeRangeTrend <- function(dta,prefix,startyr,endyr,IDfield) -{ - grep_str = paste(IDfield,prefix,sep="|") - tDF <- dta@data[grepl(grep_str,names(dta@data))] - analysisDF <- melt(tDF,id=c(IDfield)) - - #cleaned GREP - new_pre <- gsub("[0-9]","",prefix,fixed=TRUE) - analysisDF["Year"] <- lapply(analysisDF["variable"],FUN=function(x) as.numeric(gsub(new_pre,"",x))) - analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] - analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] - dta@data["newfieldID"] <- 0 - for (i in 1:length(dta)) - { - ID <- as.character(dta@data[IDfield][i,]) - #Fit trend model - ID_dat <- analysisDF[analysisDF[IDfield] == ID,] - trend_mod <- lm(value ~ Year,data=ID_dat) - dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] - } - return(dta$newfieldID) - -} \ No newline at end of file +# run linear model on data within year range as specified +# by field prefix and return coefficients +timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield) { + + # create new dataframe from all columns in dta dataframe that + # are either the ID or a year which is indicated by the prefix + grep_str = paste(IDfield, prefix, sep="|") + tDF <- dta@data[grepl(grep_str, names(dta@data))] + + # melt all years columns in new dataframe + analysisDF <- melt(tDF, id=c(IDfield)) + + # cleaned GREP - remove year digit placeholders + new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + + # generate new year field by removing prefix from variable (original column names) + analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) + + # keep years in range specified + analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] + analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] + + # create empty field + dta@data["newfieldID"] <- 0 + + # iterate over original dataframe + for (i in 1:length(dta)) { + # get id for row (in original data) + ID <- as.character(dta@data[IDfield][i,]) + + # get all data corresponding to id from analysis dataframe + ID_dat <- analysisDF[analysisDF[IDfield] == ID,] + + # fit trend model + trend_mod <- lm(value ~ Year, data=ID_dat) + + # add trend coefficients to new field + dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] + } + + # return new field with trend coefficients + return(dta$newfieldID) + +} From d3b84711139d1c430f67c64cd8bffe2c810b1005 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 09:44:52 -0400 Subject: [PATCH 002/212] Clean BuildTimeSeries.R --- DESCRIPTION | 4 +- R/BuildTimeSeries.R | 246 +++++++++++++++++++++----------------------- 2 files changed, 121 insertions(+), 129 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9f1486f..8d8fcf7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,4 +1,4 @@ -Package: SCI +Package: SCI2 Title: Spatial Cauasal Inference Version: 0.0.0.1 Authors@R: person("Dan", "Runfola", , "drunfola@aiddata.org", role = c("aut", "cre")) @@ -6,4 +6,4 @@ Description: An alpha release of a package designed to make impact evaluation th Depends: R (>= 3.0.2), sp, maptools, reshape, ggplot2, grid, gridExtra, spdep,proj4, FNN, psych License: Creative Commons with Attribution LazyData: true -Imports:sp, maptools, reshape, ggplot2, grid, gridExtra, spdep,proj4, FNN, psych \ No newline at end of file +Imports:sp, maptools, reshape, ggplot2, grid, gridExtra, spdep,proj4, FNN, psych diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index a470ad8..59b4190 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -1,147 +1,139 @@ -BuildTimeSeries <- function(dta,idField,varList_pre,startYear,endYear,colYears=NULL,interpYears=NULL) -{ - years <- startYear:endYear - #If there is a "colYears" variable, convert to binaries. - #Eventually could be extended to more than one column. - if(!is.null(colYears)) - { - #For each variable, for each year, create a binary representing the treatment status. - for(k in 1:length(years)) - { - for(j in 1:length(colYears)) - { - varN <- paste("TrtMnt_",colYears[j],years[k],sep="") - exec <- paste("dta$",varN,"=0",sep="") - eval(parse(text=exec)) +BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colYears=NULL, interpYears=NULL) { + + years <- startYear:endYear - dta@data[varN][dta@data[colYears[j]] <= as.numeric(years[k])] <- 1 - } - } - } - for (j in 1:length(colYears)) - { - trt_id = paste("TrtMnt_",colYears[j],sep="") - interpYears <- c(interpYears,trt_id) - } - print(interpYears) - #If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. - if(!is.null(interpYears)) - { - for(AncInt in 1:length(interpYears)) - { - cur_ancVi <- interpYears[AncInt] - interpFrame <- dta@data[idField] - interpFrame[idField] <- dta@data[idField] - cnt = 2 - for(k in 1:length(years)) - { - #First, build a model describing the relationship between years and any data in the interp field. - varI <- paste(cur_ancVi,years[[k]],sep="") - #Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. - if(varI %in% colnames(dta@data)) - { - add_data <- paste("interpFrame[cnt] <- dta@data$",varI) - eval(parse(text=add_data)) - colnames(interpFrame)[cnt] <- years[[k]] - cnt = cnt + 1 - } else - { - #Exception for a single-point interpolation - varC <- paste(cur_ancVi,sep="") - if(varC %in% colnames(dta@data)) - { - add_data <- paste("interpFrame[cnt] <- dta@data$",varC) - eval(parse(text=add_data)) - cnt = 3 + # If there is a "colYears" variable, convert to binaries. + # Eventually could be extended to more than one column. + if (!is.null(colYears)) { + # For each variable, for each year, create a binary representing the treatment status. + for (k in 1:length(years)) { + for (j in 1:length(colYears)) { + varN <- paste("TrtMnt_",colYears[j],years[k],sep="") + exec <- paste("dta$",varN,"=0",sep="") + eval(parse(text=exec)) + + dta@data[varN][dta@data[colYears[j]] <= as.numeric(years[k])] <- 1 } - } - } - #Only one time point, so no interpolation is done - value is simply copied to all other columns. - if(cnt == 3) - { - for(k in 1:length(years)) - { - varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") - eval(parse(text=varI)) - } - } else { - tDframe <- dta@data[idField] - #Here, we model out everything. - #Melt the dataframe for modeling - melt_Model_dta <- melt(data.frame(interpFrame),id=idField) - melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) - - #Fit the model for interpolation - execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)",sep="") - eval(parse(text=execstr)) - #Apply the model to interpolate - for(u in 1:length(years)) - { - varI <- paste(cur_ancVi,years[[u]],sep="") - if(!(varI %in% colnames(dta@data))) - { - #Variable doesn't exist, so we need to interpolate. - tDframe[idField] <- dta@data[idField] - tDframe["variable"] <- years[[u]] - dta@data[varI] <- predict(mdl,newdata=tDframe) + } + for (j in 1:length(colYears)) { + trt_id = paste("TrtMnt_",colYears[j],sep="") + interpYears <- c(interpYears,trt_id) + } + + print(interpYears) + + # If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. + if (!is.null(interpYears)) { + for (AncInt in 1:length(interpYears)) { + cur_ancVi <- interpYears[AncInt] + interpFrame <- dta@data[idField] + interpFrame[idField] <- dta@data[idField] + cnt = 2 + for (k in 1:length(years)) { + # First, build a model describing the relationship between years and any data in the interp field. + varI <- paste(cur_ancVi,years[[k]],sep="") + # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. + if (varI %in% colnames(dta@data)) { + add_data <- paste("interpFrame[cnt] <- dta@data$",varI) + eval(parse(text=add_data)) + colnames(interpFrame)[cnt] <- years[[k]] + cnt = cnt + 1 + } else { + # Exception for a single-point interpolation + varC <- paste(cur_ancVi,sep="") + if (varC %in% colnames(dta@data)) { + add_data <- paste("interpFrame[cnt] <- dta@data$",varC) + eval(parse(text=add_data)) + cnt = 3 + } + } + + } + + # Only one time point, so no interpolation is done - value is simply copied to all other columns. + if (cnt == 3) { + + for (k in 1:length(years)) { + varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") + eval(parse(text=varI)) + } + + } else { + tDframe <- dta@data[idField] + + # Here, we model out everything. + # Melt the dataframe for modeling + melt_Model_dta <- melt(data.frame(interpFrame),id=idField) + melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) + + # Fit the model for interpolation + execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)",sep="") + eval(parse(text=execstr)) + + # Apply the model to interpolate + for (u in 1:length(years)) { + varI <- paste(cur_ancVi,years[[u]],sep="") + if (!(varI %in% colnames(dta@data))) { + # Variable doesn't exist, so we need to interpolate. + tDframe[idField] <- dta@data[idField] + tDframe["variable"] <- years[[u]] + dta@data[varI] <- predict(mdl,newdata=tDframe) + + } + } } - } - } - } - - #Append interpolated fields to our melting lists - for(v in 1:length(interpYears)) - { - varList_pre[[length(varList_pre)+1]] <- interpYears[v] - } + } + + # Append interpolated fields to our melting lists + for (v in 1:length(interpYears)) { + varList_pre[[length(varList_pre)+1]] <- interpYears[v] + } - } + } - #Run the melts + # Run the melts + meltList <- list() + for (i in 1:length(varList_pre)) { - meltList <- list() - for (i in 1:length(varList_pre)) - { - #grep_str = paste(idField,"|",varList_pre[i],"[0-9][0-9][0-9][0-9]",sep="") - #Limit to only relevant years - grepStrYrs = idField - for(j in 1:length(years)) - { - tempGrep <- grepStrYrs - grepStrYrs <- paste(tempGrep,"|",varList_pre[[i]],years[[j]],sep="") - } - tDF <- dta@data[grepl(grepStrYrs,names(dta@data))] - meltList[[i]] <- melt(tDF,id=idField) - - #Keep only years in the year column, rename columns - colnames(meltList[[i]])[2] <- "Year" - colnames(meltList[[i]])[3] <- varList_pre[[i]] - - #Clean up year column - gsub_command <- paste("^",varList_pre[[i]],sep="") - meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - + # grep_str = paste(idField,"|",varList_pre[i],"[0-9][0-9][0-9][0-9]",sep="") + # Limit to only relevant years + grepStrYrs = idField + for (j in 1:length(years)) { + tempGrep <- grepStrYrs + grepStrYrs <- paste(tempGrep,"|",varList_pre[[i]],years[[j]],sep="") + } + + tDF <- dta@data[grepl(grepStrYrs,names(dta@data))] + meltList[[i]] <- melt(tDF,id=idField) + + # Keep only years in the year column, rename columns + colnames(meltList[[i]])[2] <- "Year" + colnames(meltList[[i]])[3] <- varList_pre[[i]] + + # Clean up year column + gsub_command <- paste("^",varList_pre[[i]],sep="") + meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) + - #Remove ID and year if this is at least the second variable to avoid duplications. - if(i > 1) - { - meltList[[i]] <- meltList[[i]][3] - } + # Remove ID and year if this is at least the second variable to avoid duplications. + if (i > 1) { + meltList[[i]] <- meltList[[i]][3] + } - } + } - #Finish up with a cherry on top - meltListRet <- data.frame(meltList) + # Finish up with a cherry on top + meltListRet <- data.frame(meltList) - return(meltListRet) + return(meltListRet) } # dm1 <- melt(d[,c("Type","I.alt","idx06","idx07","idx08")], id=c("Type","I.alt")) # dm2 <- melt(d[,c("Type","I.alt","farve1","farve2")], id=c("Type","I.alt")) # colnames(dm2) <- c("Type", "I.alt", "variable2", "value2") -# dm <- merge(dm1, dm2) \ No newline at end of file +# dm <- merge(dm1, dm2) From c8bb7ad729f0a93fc0d425c599f0c223f7488b83 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 09:47:39 -0400 Subject: [PATCH 003/212] Fix bug. --- R/SpatialCausalPSM.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 5284989..a693a36 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -6,7 +6,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) {} if (mtd == "logit") { # generalized linear model PSMfit <- glm(mdl, dta@data, family="binomial") - else if (mtd == "lm") { + } else if (mtd == "lm") { # linear model PSMfit <- lm(mdl, dta@data) } From 4f7240fb06ec519877ed88c5eaf9449c1a9238d1 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 09:48:30 -0400 Subject: [PATCH 004/212] Fix bug. --- R/SpatialCausalPSM.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index a693a36..4a284c4 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -1,4 +1,4 @@ -SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) {} +SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # Initialization pltObjs <- list() From cd1193a8a22ad008969b211526c78e2cd54238f7 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 10:06:01 -0400 Subject: [PATCH 005/212] Remove some eval statments from buildtimeseries func. --- R/BuildTimeSeries.R | 78 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 59b4190..8c8dda2 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -2,27 +2,54 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY years <- startYear:endYear + + + print("bts1") + timer <- proc.time() + # If there is a "colYears" variable, convert to binaries. # Eventually could be extended to more than one column. if (!is.null(colYears)) { # For each variable, for each year, create a binary representing the treatment status. for (k in 1:length(years)) { for (j in 1:length(colYears)) { - varN <- paste("TrtMnt_",colYears[j],years[k],sep="") - exec <- paste("dta$",varN,"=0",sep="") - eval(parse(text=exec)) + + varN <- paste("TrtMnt_", colYears[j], years[k], sep="") + + + # exec <- paste("dta$",varN,"=0",sep="") + # eval(parse(text=exec)) + + dta[,varN] = 0 + dta@data[varN][dta@data[colYears[j]] <= as.numeric(years[k])] <- 1 } } } + timer <- proc.time() - timer + print(paste("section completed in", timer[3], "seconds.")) + + + + print("bts2") + timer <- proc.time() + for (j in 1:length(colYears)) { trt_id = paste("TrtMnt_",colYears[j],sep="") interpYears <- c(interpYears,trt_id) } print(interpYears) + + timer <- proc.time() - timer + print(paste("section completed in", timer[3], "seconds.")) + + + + print("bts3") + timer <- proc.time() # If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. if (!is.null(interpYears)) { @@ -36,16 +63,26 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varI <- paste(cur_ancVi,years[[k]],sep="") # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. if (varI %in% colnames(dta@data)) { - add_data <- paste("interpFrame[cnt] <- dta@data$",varI) - eval(parse(text=add_data)) + + + # add_data <- paste("interpFrame[cnt] <- dta@data$",varI) + # eval(parse(text=add_data)) + interpFrame[cnt] <- dta@data[,varI] + + colnames(interpFrame)[cnt] <- years[[k]] cnt = cnt + 1 } else { # Exception for a single-point interpolation varC <- paste(cur_ancVi,sep="") if (varC %in% colnames(dta@data)) { - add_data <- paste("interpFrame[cnt] <- dta@data$",varC) - eval(parse(text=add_data)) + + + # add_data <- paste("interpFrame[cnt] <- dta@data$",varC) + # eval(parse(text=add_data)) + interpFrame[cnt] <- dta@data[,varC] + + cnt = 3 } } @@ -56,22 +93,28 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (cnt == 3) { for (k in 1:length(years)) { - varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") - eval(parse(text=varI)) + + # varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") + # eval(parse(text=varI)) + dta@data[,cur_ancVi,years[[k]]] <- interpFrame[2] } } else { tDframe <- dta@data[idField] - + # Here, we model out everything. # Melt the dataframe for modeling melt_Model_dta <- melt(data.frame(interpFrame),id=idField) melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) # Fit the model for interpolation + execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)",sep="") eval(parse(text=execstr)) + # mdl <- lm(value ~ variable + factor(idField), data=melt_Model_dta) + + # Apply the model to interpolate for (u in 1:length(years)) { varI <- paste(cur_ancVi,years[[u]],sep="") @@ -94,7 +137,14 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } - + timer <- proc.time() - timer + print(paste("section completed in", timer[3], "seconds.")) + + + + print("bts4") + timer <- proc.time() + # Run the melts meltList <- list() for (i in 1:length(varList_pre)) { @@ -126,6 +176,12 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } + + timer <- proc.time() - timer + print(paste("section completed in", timer[3], "seconds.")) + + + # Finish up with a cherry on top meltListRet <- data.frame(meltList) From a2a378c874044b77705197bd33bfb537d237a570 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 10:36:20 -0400 Subject: [PATCH 006/212] Testing. --- R/BuildTimeSeries.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 8c8dda2..34286b8 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -16,11 +16,11 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varN <- paste("TrtMnt_", colYears[j], years[k], sep="") + print(varN) + exec <- paste("dta$",varN,"=0",sep="") + eval(parse(text=exec)) - # exec <- paste("dta$",varN,"=0",sep="") - # eval(parse(text=exec)) - - dta[,varN] = 0 + # dta[,varN] = 0 dta@data[varN][dta@data[colYears[j]] <= as.numeric(years[k])] <- 1 From 6f3ffbee01f95b713ce067768ffa2f84726beb1b Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 10:56:30 -0400 Subject: [PATCH 007/212] Testing. --- R/BuildTimeSeries.R | 1 + R/SAT.R | 375 +++++++++++++++++++++----------------------- 2 files changed, 180 insertions(+), 196 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 34286b8..a91cd2e 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -61,6 +61,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (k in 1:length(years)) { # First, build a model describing the relationship between years and any data in the interp field. varI <- paste(cur_ancVi,years[[k]],sep="") + print(varI) # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. if (varI %in% colnames(dta@data)) { diff --git a/R/SAT.R b/R/SAT.R index f630f35..fb96d11 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -1,227 +1,210 @@ -SAT <- function(dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName) -{ - #Initialization - pltObjs <- list() - init_dta <- dta +SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName) { + #Initialization + pltObjs <- list() + init_dta <- dta - drop_unmatched = drop_opts["drop_unmatched"] - drop_method = drop_opts["drop_method"] - drop_thresh = as.numeric(drop_opts["drop_thresh"]) + drop_unmatched = drop_opts["drop_unmatched"] + drop_method = drop_opts["drop_method"] + drop_thresh = as.numeric(drop_opts["drop_thresh"]) - if(!is.null(constraints)) - { - for(cst in 1:length(names(constraints))) - { - if(names(constraints)[cst] == "groups") - { - exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") - eval(parse(text=exec_stmnt)) - } else - { + if (!is.null(constraints)) { + for (cst in 1:length(names(constraints))) { + if(names(constraints)[cst] == "groups") { + exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") + eval(parse(text=exec_stmnt)) + } else { + dta$ConstraintGroupSet_Opt <- 1 + } + if (names(constraints)[cst] == "distance") { + dist_PSM = as.numeric(constraints["distance"][[1]]) + } else { + dist_PSM=NULL + } + } + } else { dta$ConstraintGroupSet_Opt <- 1 - } - if(names(constraints)[cst] == "distance") - { - dist_PSM = as.numeric(constraints["distance"][[1]]) - } else - { - dist_PSM=NULL - } + #max the distance threshold by taking the diagonal of the bounding box. + dist_PSM = NULL } - } else { - dta$ConstraintGroupSet_Opt <- 1 - #max the distance threshold by taking the diagonal of the bounding box. - dist_PSM = NULL - } - #Caclulate the number of groups to constrain by, if any. - group_constraints <- unique(dta$ConstraintGroupSet_Opt) + #Caclulate the number of groups to constrain by, if any. + group_constraints <- unique(dta$ConstraintGroupSet_Opt) - #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) - t_dta <- list() - u_dta <-list() - grp_list <- list() - cnt = 0 - for (grp in 1:length(group_constraints)) - { - cur_grp <- as.matrix(group_constraints)[grp] - grp_index = length(grp_list)+1 - t_index = length(t_dta)+1 - grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] - t_dta[[t_index]] <- dta[dta$TrtBin == 1,] - u_dta[[t_index]] <- dta[dta$TrtBin == 0,] - treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt - untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt - if((untreated_count == FALSE) || (treatment_count == FALSE)) - { - dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] - t_dta[[t_index]] <- NULL - u_dta[[t_index]] <- NULL - grp_list[[t_index]] <- NULL - war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") - warning(war_statement) - } else { - - t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) + t_dta <- list() + u_dta <-list() + grp_list <- list() + cnt = 0 + for (grp in 1:length(group_constraints)) { + cur_grp <- as.matrix(group_constraints)[grp] + grp_index = length(grp_list)+1 + t_index = length(t_dta)+1 + grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] + t_dta[[t_index]] <- dta[dta$TrtBin == 1,] + u_dta[[t_index]] <- dta[dta$TrtBin == 0,] + treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt + untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt + if ((untreated_count == FALSE) || (treatment_count == FALSE)) { + dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] + t_dta[[t_index]] <- NULL + u_dta[[t_index]] <- NULL + grp_list[[t_index]] <- NULL + war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") + warning(war_statement) + } else { + t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - cnt = cnt + 1 + cnt = cnt + 1 + } } - } - temp_dta <- list() -for(i in 1:cnt) - { - cur_grp <- grp_list[[i]] - it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + + temp_dta <- list() + + for (i in 1:cnt) { + cur_grp <- grp_list[[i]] + it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - if (mtd == "fastNN") - { - temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) - } + if (mtd == "fastNN") { + temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + } - if (mtd == "NN_WithReplacement") - { - print("NN with replacement is currently not available, please choose fastNN") - # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + if (mtd == "NN_WithReplacement") { + print("NN with replacement is currently not available, please choose fastNN") + # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + } + } + + #Build the final datasets from subsets + if (cnt > 1) { + dta <- temp_dta[[1]] + for(k in 2:cnt) { + dta <- maptools::spRbind(dta, temp_dta[[k]]) + } + } else { + dta <- temp_dta[[1]] } - } - -#Build the final datasets from subsets -if(cnt > 1) -{ - dta <- temp_dta[[1]] - for(k in 2:cnt) - { - dta <- maptools::spRbind(dta, temp_dta[[k]]) - } -} else { - dta <- temp_dta[[1]] -} - if (drop_unmatched == TRUE) - { - dta <- dta[dta@data$PSM_match_ID != -999,] - } + if (drop_unmatched == TRUE) { + dta <- dta[dta@data$PSM_match_ID != -999,] + } - anc_v_int <- strsplit(psm_eq, "~")[[1]][2] - anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) - anc_vars <- c(anc_vars[[1]], "PSM_trtProb") + anc_v_int <- strsplit(psm_eq, "~")[[1]][2] + anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) + anc_vars <- c(anc_vars[[1]], "PSM_trtProb") - #Drop observations according to the selected method - if(drop_method == "SD") - { - #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. - psm_sd_thresh = sd(dta$PSM_trtProb) * drop_thresh - if(visual == "TRUE") - { - print(psm_sd_thresh) + #Drop observations according to the selected method + if (drop_method == "SD") { + #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. + psm_sd_thresh = sd(dta$PSM_trtProb) * drop_thresh + if (visual == "TRUE") { + print(psm_sd_thresh) + } + dta <- dta[dta@data$PSM_distance < psm_sd_thresh,] } - dta <- dta[dta@data$PSM_distance < psm_sd_thresh,] - } - #Plot the pre and post-dropping balance for PSM model... - #Balance metrics are based on "Misunderstandings between experimentalists and - #observationalists about causal inference", Imal, King, and Stuart. - #Simplest suggestion of comparing means and checking if .25 SD apart used. - cnt = 0 - for (i in 1:length(anc_vars)) - { - #gsub to remove any factors() - ed_v = sub("factor\\(","",anc_vars[i]) - ed_v = sub(")","",ed_v) - treat_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[3]],5)") - treat_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[4]],5)") - - control_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[3]],5)") - control_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[4]],5)") - - treat_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[3]],5)") - treat_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[4]],5)") - - control_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[3]],5)") - control_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[4]],5)") - - c_type = eval(parse(text=paste("class(init_dta@data$",ed_v,")"))) + #Plot the pre and post-dropping balance for PSM model... + #Balance metrics are based on "Misunderstandings between experimentalists and + #observationalists about causal inference", Imal, King, and Stuart. + #Simplest suggestion of comparing means and checking if .25 SD apart used. + cnt = 0 - if(c_type == "matrix") - { - exec_str = paste("dta@data$",ed_v,"<- as.numeric(dta@data$",ed_v,")",sep="") - eval(parse(text=exec_str)) - - exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") - eval(parse(text=exec_str)) - c_type = "numeric" - } - if((c_type == "numeric") & (visual == "TRUE")) - { - cnt = cnt + 1 - pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) - pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) - - treat_mean_pre <- eval(parse(text=treat_mean_pre)) - treat_SD_pre <- eval(parse(text=treat_SD_pre)) - control_mean_pre <- eval(parse(text=control_mean_pre)) - control_SD_pre <- eval(parse(text=control_SD_pre)) - - treat_mean_post <- eval(parse(text=treat_mean_post)) - treat_SD_post <- eval(parse(text=treat_SD_post)) - control_mean_post <- eval(parse(text=control_mean_post)) - control_SD_post <- eval(parse(text=control_SD_post)) - - it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) - it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) - - if(!exists("bRes")) - { - bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, - treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, - it_diff_Mean_pre,it_diff_Mean_post) - colnames(bRes)[1] <- "Pre-Balance Treated Mean" - colnames(bRes)[2] <- "Pre-Balance Treated SD" - colnames(bRes)[3] <- "Pre-Balance Control Mean" - colnames(bRes)[4] <- "Pre-Balance Control SD" + for (i in 1:length(anc_vars)) { + #gsub to remove any factors() + ed_v = sub("factor\\(","",anc_vars[i]) + ed_v = sub(")","",ed_v) + treat_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[3]],5)") + treat_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[4]],5)") - colnames(bRes)[5] <- "Post-Balance Treated Mean" - colnames(bRes)[6] <- "Post-Balance Treated SD" - colnames(bRes)[7] <- "Post-Balance Control Mean" - colnames(bRes)[8] <- "Post-Balance Control SD" + control_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[3]],5)") + control_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[4]],5)") - colnames(bRes)[9] <- "Mean Difference Pre-Balance" - colnames(bRes)[10] <- "Mean Difference Post-Balance" - }else{ - bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, + treat_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[3]],5)") + treat_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[4]],5)") + + control_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[3]],5)") + control_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[4]],5)") + + # c_type = eval(parse(text=paste("class(init_dta@data$",ed_v,")"))) + c_type = class(init_dta@data[,ed_v]) + + if (c_type == "matrix") { + + # exec_str = paste("dta@data$",ed_v,"<- as.numeric(dta@data$",ed_v,")",sep="") + # eval(parse(text=exec_str)) + dta@data[,ed_v] <- as.numeric(dta@data[,ed_v] + + # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") + # eval(parse(text=exec_str)) + init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v] + + c_type = "numeric" + } + + if ((c_type == "numeric") & (visual == "TRUE")) { + cnt = cnt + 1 + pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) + pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) + + treat_mean_pre <- eval(parse(text=treat_mean_pre)) + treat_SD_pre <- eval(parse(text=treat_SD_pre)) + control_mean_pre <- eval(parse(text=control_mean_pre)) + control_SD_pre <- eval(parse(text=control_SD_pre)) + + treat_mean_post <- eval(parse(text=treat_mean_post)) + treat_SD_post <- eval(parse(text=treat_SD_post)) + control_mean_post <- eval(parse(text=control_mean_post)) + control_SD_post <- eval(parse(text=control_SD_post)) + + it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) + it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) + + if(!exists("bRes")) { + bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, + treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, + it_diff_Mean_pre,it_diff_Mean_post) + colnames(bRes)[1] <- "Pre-Balance Treated Mean" + colnames(bRes)[2] <- "Pre-Balance Treated SD" + colnames(bRes)[3] <- "Pre-Balance Control Mean" + colnames(bRes)[4] <- "Pre-Balance Control SD" + + colnames(bRes)[5] <- "Post-Balance Treated Mean" + colnames(bRes)[6] <- "Post-Balance Treated SD" + colnames(bRes)[7] <- "Post-Balance Control Mean" + colnames(bRes)[8] <- "Post-Balance Control SD" + + colnames(bRes)[9] <- "Mean Difference Pre-Balance" + colnames(bRes)[10] <- "Mean Difference Post-Balance" + } else { + bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post)) - } + } - - rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]","",ed_v) + rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]","",ed_v) + } } - } - if(visual=="TRUE") - { - #Output graphics - #Remove the factor rows - nrow_c <- length(pltObjs) - counter <- 1 - while(counter <= nrow_c) - { - d = counter + 3 - if(d > nrow_c) - { - d = nrow_c - } - do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) - counter = counter + 4 + if (visual=="TRUE") { + #Output graphics + #Remove the factor rows + nrow_c <- length(pltObjs) + counter <- 1 + while(counter <= nrow_c) { + d = counter + 3 + if(d > nrow_c) { + d = nrow_c + } + do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) + counter = counter + 4 + } + #bTab <- stargazer(bRes,summary=FALSE,type="html") + #print.htmlTable(bTab) } - #bTab <- stargazer(bRes,summary=FALSE,type="html") - #print.htmlTable(bTab) - } - return (dta) + return (dta) } From 88581428fb3ae98e3533eba1a1add3b5bdea4725 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 10:57:55 -0400 Subject: [PATCH 008/212] Testing. --- R/SAT.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index fb96d11..59b0798 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -138,7 +138,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v] + init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v]) c_type = "numeric" } From f9e28e25807fe94b0c40f0faf72bc39ae13c8006 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 10:59:21 -0400 Subject: [PATCH 009/212] Testing. --- R/SAT.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index 59b0798..7569e1c 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -138,7 +138,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v]) + init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]) c_type = "numeric" } From 00f9515bf431d400d7047a3ab9362e145e4f36e6 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 10:59:50 -0400 Subject: [PATCH 010/212] Testing. --- R/SAT.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 7569e1c..fc793e7 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -136,9 +136,9 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # eval(parse(text=exec_str)) dta@data[,ed_v] <- as.numeric(dta@data[,ed_v] - # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") - # eval(parse(text=exec_str)) - init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]) + exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") + eval(parse(text=exec_str)) + # init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]) c_type = "numeric" } From 02b3a6eac70ed238c5c1883a6b97d3bee7ef7c8c Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:00:19 -0400 Subject: [PATCH 011/212] Testing. --- R/SAT.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index fc793e7..5917a9b 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -134,11 +134,11 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # exec_str = paste("dta@data$",ed_v,"<- as.numeric(dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - dta@data[,ed_v] <- as.numeric(dta@data[,ed_v] + dta@data[,ed_v] <- as.numeric(dta@data[,ed_v]) - exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") - eval(parse(text=exec_str)) - # init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]) + # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") + # eval(parse(text=exec_str)) + init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]) c_type = "numeric" } From 849e133068a9d6ff1b75a11c9def0bc00b00999c Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:00:45 -0400 Subject: [PATCH 012/212] Testing. --- R/SAT.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index 5917a9b..906ced7 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -138,7 +138,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]) + init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]]) c_type = "numeric" } From fca8884cb771ccbf16958cca73dedaca969913ab Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:02:42 -0400 Subject: [PATCH 013/212] Get rid of eval statements in SAT.R. --- R/SAT.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 906ced7..bc1b3b4 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -10,13 +10,15 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (!is.null(constraints)) { for (cst in 1:length(names(constraints))) { if(names(constraints)[cst] == "groups") { - exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") - eval(parse(text=exec_stmnt)) + # exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") + # eval(parse(text=exec_stmnt)) + dta[,ConstraintGroupSet_Opt] <- dta[,constraints["groups"]] + } else { dta$ConstraintGroupSet_Opt <- 1 } if (names(constraints)[cst] == "distance") { - dist_PSM = as.numeric(constraints["distance"][[1]]) + dist_PSM = as.numeric(constraints["distance"][[1]]) } else { dist_PSM=NULL } @@ -138,7 +140,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]]) + init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v]) c_type = "numeric" } From 604dc4318d6ab5318b0134b340f1b5edd5d67f8b Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:04:26 -0400 Subject: [PATCH 014/212] Fix bug. --- R/SAT.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index bc1b3b4..e40ae58 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -12,7 +12,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if(names(constraints)[cst] == "groups") { # exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") # eval(parse(text=exec_stmnt)) - dta[,ConstraintGroupSet_Opt] <- dta[,constraints["groups"]] + dta[,"ConstraintGroupSet_Opt"] <- dta[,constraints["groups"]] } else { dta$ConstraintGroupSet_Opt <- 1 From 2c4bb58714ac94da1646966a2b5819c632af6801 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:06:58 -0400 Subject: [PATCH 015/212] Testing. --- R/SAT.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index e40ae58..4f6f178 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -7,6 +7,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo drop_method = drop_opts["drop_method"] drop_thresh = as.numeric(drop_opts["drop_thresh"]) + print("sat1") + if (!is.null(constraints)) { for (cst in 1:length(names(constraints))) { if(names(constraints)[cst] == "groups") { @@ -28,7 +30,9 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #max the distance threshold by taking the diagonal of the bounding box. dist_PSM = NULL } - + + print("sat2") + #Caclulate the number of groups to constrain by, if any. group_constraints <- unique(dta$ConstraintGroupSet_Opt) @@ -60,6 +64,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo cnt = cnt + 1 } } + print("sat3") temp_dta <- list() @@ -77,6 +82,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } } + print("sat4") #Build the final datasets from subsets if (cnt > 1) { @@ -88,6 +94,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo dta <- temp_dta[[1]] } + print("sat5") if (drop_unmatched == TRUE) { dta <- dta[dta@data$PSM_match_ID != -999,] @@ -96,6 +103,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo anc_v_int <- strsplit(psm_eq, "~")[[1]][2] anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) anc_vars <- c(anc_vars[[1]], "PSM_trtProb") + + print("sat6") #Drop observations according to the selected method if (drop_method == "SD") { @@ -112,6 +121,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #observationalists about causal inference", Imal, King, and Stuart. #Simplest suggestion of comparing means and checking if .25 SD apart used. cnt = 0 + + print("sat7") for (i in 1:length(anc_vars)) { #gsub to remove any factors() @@ -188,6 +199,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]","",ed_v) } } + + print("sat8") if (visual=="TRUE") { #Output graphics From 977e68e5fb0aea82da43b0cd3f86d4311464fc67 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:09:47 -0400 Subject: [PATCH 016/212] Testing. --- R/SAT.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 4f6f178..791ebb5 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -11,13 +11,13 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (!is.null(constraints)) { for (cst in 1:length(names(constraints))) { - if(names(constraints)[cst] == "groups") { + if (names(constraints)[cst] == "groups") { # exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") # eval(parse(text=exec_stmnt)) - dta[,"ConstraintGroupSet_Opt"] <- dta[,constraints["groups"]] + dta[["ConstraintGroupSet_Opt"]] <- dta[[constraints["groups"]]] } else { - dta$ConstraintGroupSet_Opt <- 1 + dta[["ConstraintGroupSet_Opt"]] <- 1 } if (names(constraints)[cst] == "distance") { dist_PSM = as.numeric(constraints["distance"][[1]]) @@ -26,7 +26,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } } } else { - dta$ConstraintGroupSet_Opt <- 1 + dta[["ConstraintGroupSet_Opt"]] <- 1 #max the distance threshold by taking the diagonal of the bounding box. dist_PSM = NULL } @@ -34,7 +34,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat2") #Caclulate the number of groups to constrain by, if any. - group_constraints <- unique(dta$ConstraintGroupSet_Opt) + group_constraints <- unique(dta[["ConstraintGroupSet_Opt"]]) #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) t_dta <- list() @@ -199,7 +199,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]","",ed_v) } } - + print("sat8") if (visual=="TRUE") { From c5fe38f75a6db91f1509ee48a9b5414cb62ae58b Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:17:35 -0400 Subject: [PATCH 017/212] Testing. --- R/SAT.R | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 791ebb5..13d0093 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -70,9 +70,11 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo for (i in 1:cnt) { cur_grp <- grp_list[[i]] + + print("sat3.1") it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - + print("sat3.2") if (mtd == "fastNN") { temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } @@ -143,19 +145,21 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # c_type = eval(parse(text=paste("class(init_dta@data$",ed_v,")"))) c_type = class(init_dta@data[,ed_v]) + print("sat7.1") if (c_type == "matrix") { # exec_str = paste("dta@data$",ed_v,"<- as.numeric(dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - dta@data[,ed_v] <- as.numeric(dta@data[,ed_v]) + dta@data[[ed_v]] <- as.numeric(dta@data[[ed_v]]) # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") # eval(parse(text=exec_str)) - init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v]) + init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]]) c_type = "numeric" } + print("sat7.2") if ((c_type == "numeric") & (visual == "TRUE")) { cnt = cnt + 1 pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) @@ -174,7 +178,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) - if(!exists("bRes")) { + if (!exists("bRes")) { bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post) @@ -207,9 +211,9 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #Remove the factor rows nrow_c <- length(pltObjs) counter <- 1 - while(counter <= nrow_c) { + while (counter <= nrow_c) { d = counter + 3 - if(d > nrow_c) { + if (d > nrow_c) { d = nrow_c } do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) From 8726c83f311dde696dd9b7451284e9faedfeff62 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:32:29 -0400 Subject: [PATCH 018/212] Testing. --- R/SAT.R | 78 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 13d0093..1aa2f11 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -46,20 +46,20 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo grp_index = length(grp_list)+1 t_index = length(t_dta)+1 grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] - t_dta[[t_index]] <- dta[dta$TrtBin == 1,] - u_dta[[t_index]] <- dta[dta$TrtBin == 0,] - treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt - untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt + t_dta[[t_index]] <- dta[dta[["TrtBin"]] == 1,] + u_dta[[t_index]] <- dta[dta[["TrtBin"]] == 0,] + treatment_count <- cur_grp %in% t_dta[[t_index]][["ConstraintGroupSet_Opt"]] + untreated_count <- cur_grp %in% u_dta[[t_index]][["ConstraintGroupSet_Opt"]] if ((untreated_count == FALSE) || (treatment_count == FALSE)) { - dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] + dta <- dta[!dta[["ConstraintGroupSet_Opt"]] == cur_grp,] t_dta[[t_index]] <- NULL u_dta[[t_index]] <- NULL grp_list[[t_index]] <- NULL war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") warning(war_statement) } else { - t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]][["ConstraintGroupSet_Opt"]] == cur_grp,] + u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]][["ConstraintGroupSet_Opt"]] == cur_grp,] cnt = cnt + 1 } @@ -76,9 +76,10 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat3.2") if (mtd == "fastNN") { + # this is the slow part of functions temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } - + if (mtd == "NN_WithReplacement") { print("NN with replacement is currently not available, please choose fastNN") # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) @@ -99,7 +100,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat5") if (drop_unmatched == TRUE) { - dta <- dta[dta@data$PSM_match_ID != -999,] + dta <- dta[dta@data[["PSM_match_ID"]] != -999,] } anc_v_int <- strsplit(psm_eq, "~")[[1]][2] @@ -111,11 +112,11 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #Drop observations according to the selected method if (drop_method == "SD") { #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. - psm_sd_thresh = sd(dta$PSM_trtProb) * drop_thresh + psm_sd_thresh = sd(dta[["PSM_trtProb"]]) * drop_thresh if (visual == "TRUE") { print(psm_sd_thresh) } - dta <- dta[dta@data$PSM_distance < psm_sd_thresh,] + dta <- dta[dta@data[["PSM_distance"]] < psm_sd_thresh,] } #Plot the pre and post-dropping balance for PSM model... @@ -127,23 +128,27 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat7") for (i in 1:length(anc_vars)) { + + print("sat7.0") + #gsub to remove any factors() ed_v = sub("factor\\(","",anc_vars[i]) ed_v = sub(")","",ed_v) - treat_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[3]],5)") - treat_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[4]],5)") + + # treat_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[3]],5)") + # treat_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[4]],5)") - control_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[3]],5)") - control_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[4]],5)") + # control_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[3]],5)") + # control_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[4]],5)") - treat_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[3]],5)") - treat_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[4]],5)") + # treat_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[3]],5)") + # treat_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[4]],5)") - control_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[3]],5)") - control_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[4]],5)") + # control_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[3]],5)") + # control_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[4]],5)") # c_type = eval(parse(text=paste("class(init_dta@data$",ed_v,")"))) - c_type = class(init_dta@data[,ed_v]) + c_type = class(init_dta@data[[ed_v]]) print("sat7.1") if (c_type == "matrix") { @@ -165,23 +170,37 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) - treat_mean_pre <- eval(parse(text=treat_mean_pre)) - treat_SD_pre <- eval(parse(text=treat_SD_pre)) - control_mean_pre <- eval(parse(text=control_mean_pre)) - control_SD_pre <- eval(parse(text=control_SD_pre)) + # treat_mean_pre <- eval(parse(text=treat_mean_pre)) + # treat_SD_pre <- eval(parse(text=treat_SD_pre)) + # control_mean_pre <- eval(parse(text=control_mean_pre)) + # control_SD_pre <- eval(parse(text=control_SD_pre)) + + # treat_mean_post <- eval(parse(text=treat_mean_post)) + # treat_SD_post <- eval(parse(text=treat_SD_post)) + # control_mean_post <- eval(parse(text=control_mean_post)) + # control_SD_post <- eval(parse(text=control_SD_post)) + + + treat_mean_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[3]], 5) + treat_SD_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[4]], 5) + control_mean_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[1]][[3]], 5) + control_SD_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[1]][[4]], 5) + + treat_mean_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[2]][[3]], 5) + treat_SD_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[2]][[4]], 5) + control_mean_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[1]][[3]], 5) + control_SD_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[1]][[4]], 5) - treat_mean_post <- eval(parse(text=treat_mean_post)) - treat_SD_post <- eval(parse(text=treat_SD_post)) - control_mean_post <- eval(parse(text=control_mean_post)) - control_SD_post <- eval(parse(text=control_SD_post)) it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) if (!exists("bRes")) { + bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post) + colnames(bRes)[1] <- "Pre-Balance Treated Mean" colnames(bRes)[2] <- "Pre-Balance Treated SD" colnames(bRes)[3] <- "Pre-Balance Control Mean" @@ -194,13 +213,14 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo colnames(bRes)[9] <- "Mean Difference Pre-Balance" colnames(bRes)[10] <- "Mean Difference Post-Balance" + } else { bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post)) } - rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]","",ed_v) + rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]", "", ed_v) } } From 0cee107d7dd53356396f7e536575ed6c592b5437 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 11:56:47 -0400 Subject: [PATCH 019/212] Remove evals from fastNN_binary_func.R. --- R/fastNN_binary_func.R | 232 ++++++++++++++++++++++------------------- 1 file changed, 127 insertions(+), 105 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index a9e1419..73ec549 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -5,120 +5,142 @@ #and then working through this list in order from highest to lowest. #Matches are removed each step. -fastNN_binary_func <- function(dta,trtMntVar,ids,curgrp,dist_PSM) -{ - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data$PSM_trtProb),] - #Conduct the matching - - str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") - str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") - eval(parse(text=str_trted)) - eval(parse(text=str_untrted)) - - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - dta@data$match <- -999 - dta@data$PSM_distance <- -999 - dta@data$PSM_match_ID <- -999 - - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) - { - str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") - str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") - eval(parse(text=str_trted)) - eval(parse(text=str_untrted)) - - +fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + sorted_dta <- dta@data[order(dta@data$PSM_trtProb),] - #Run the KNN for all neighbors. - k <- get.knnx(treated$PSM_trtProb, untreated$PSM_trtProb, 1) + + #Conduct the matching + + # str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") + # str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") + # eval(parse(text=str_trted)) + # eval(parse(text=str_untrted)) + + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] + + + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + dta@data[["match"]] <- -999 + dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 + + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + + # str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") + # str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") + # eval(parse(text=str_trted)) + # eval(parse(text=str_untrted)) - #Perturb the values based on the distance decay function, if selected. - if(!is.null(dist_PSM)) - { - for(mC in 1:length(k[[1]])) - { - #Calculate the Euclidean Distance between pairs - cid_txt = paste("untreated$",ids,"[",mC,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) - - mT = k$nn.index[mC] - - tid_txt = paste("treated$",ids,"[",mT,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) - - #Find the control x,y location - cCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Control_ID),])", sep="") - cCoord = eval(parse(text=cCoord_e)) + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] ==0,] - - #Find the treatment x,y location - tCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Treatment_ID),])", sep="") - tCoord = eval(parse(text=tCoord_e)) - y_dist = abs(cCoord[1] - cCoord[2]) - x_dist = abs(tCoord[1] - tCoord[2]) - euc_dist = sqrt(y_dist^2 + x_dist^2) + #Run the KNN for all neighbors. + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - PSM_score = k$nn.dist[mC] - geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + #Perturb the values based on the distance decay function, if selected. + if (!is.null(dist_PSM)) { + for (mC in 1:length(k[[1]])) { + #Calculate the Euclidean Distance between pairs + # cid_txt = paste("untreated$",ids,"[",mC,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) + Control_ID = toString(untreated[[ids]][[mC]]) + mT = k$nn.index[mC] + + # tid_txt = paste("treated$",ids,"[",mT,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) + Treatment_ID = toString(treated[[ids]][[mT]]) - - k$nn.dist[mC] <- ((1-geog_Weight) * PSM_score) + #Find the control x,y location + # cCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Control_ID),])", sep="") + # cCoord = eval(parse(text=cCoord_e)) + cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + + + #Find the treatment x,y location + # tCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Treatment_ID),])", sep="") + # tCoord = eval(parse(text=tCoord_e)) + tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + + y_dist = abs(cCoord[1] - cCoord[2]) + x_dist = abs(tCoord[1] - tCoord[2]) + euc_dist = sqrt(y_dist^2 + x_dist^2) + + PSM_score = k$nn.dist[mC] + geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - } + + + k$nn.dist[mC] <- ((1-geog_Weight) * PSM_score) + + } - } - - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k$nn.dist %in% sort(k$nn.dist)[1]) - - #This will give us the matched index in the "treated" dataset. - best_m_treated = k$nn.index[best_m_control] + } - #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k$nn.dist %in% sort(k$nn.dist)[1]) - #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) - - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j,sep="") - - #Add the Treatment ID to the Control Row - tid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Control_ID)] = Treatment_ID", sep="") - tid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Control_ID)] = k$nn.dist[,1][best_m_control]",sep="") - tid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Control_ID)] = pair_id", sep="") - eval(parse(text=tid_a_1)) - eval(parse(text=tid_a_2)) - eval(parse(text=tid_a_3)) - - - - #Add the Control ID to the Treatment Row - cid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Treatment_ID)] = Control_ID", sep="") - cid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Treatment_ID)] = k$nn.dist[,1][best_m_control]", sep="") - cid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Treatment_ID)] = pair_id", sep="") - eval(parse(text=cid_a_1)) - eval(parse(text=cid_a_2)) - eval(parse(text=cid_a_3)) - - - #Drop the paired match out of the iteration matrix - did_a_1 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Treatment_ID ,]",sep="") - did_a_2 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Control_ID ,]",sep="") - eval(parse(text=did_a_1)) - eval(parse(text=did_a_2)) - + #This will give us the matched index in the "treated" dataset. + best_m_treated = k$nn.index[best_m_control] + + #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) + Control_ID = toString(untreated[[ids]][[best_m_control]]) + + #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) + Treatment_ID = toString(treated[[ids]][[best_m_treated]]) + + + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j,sep="") + + + #Add the Treatment ID to the Control Row + # tid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Control_ID)] = Treatment_ID", sep="") + # tid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Control_ID)] = k$nn.dist[,1][best_m_control]",sep="") + # tid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Control_ID)] = pair_id", sep="") + # eval(parse(text=tid_a_1)) + # eval(parse(text=tid_a_2)) + # eval(parse(text=tid_a_3)) + dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID + dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k$nn.dist[,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID)] = pair_id + + + #Add the Control ID to the Treatment Row + # cid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Treatment_ID)] = Control_ID", sep="") + # cid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Treatment_ID)] = k$nn.dist[,1][best_m_control]", sep="") + # cid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Treatment_ID)] = pair_id", sep="") + # eval(parse(text=cid_a_1)) + # eval(parse(text=cid_a_2)) + # eval(parse(text=cid_a_3)) + dta@data$match[which(dta@data[[ids]] == Treatment_ID)] = Control_ID + dta@data$PSM_distance[which(dta@data[[ids]] == Treatment_ID)] = k$nn.dist[,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Treatment_ID)] = pair_id + + #Drop the paired match out of the iteration matrix + # did_a_1 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Treatment_ID ,]",sep="") + # did_a_2 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Control_ID ,]",sep="") + # eval(parse(text=did_a_1)) + # eval(parse(text=did_a_2)) + sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - } - return(dta) -} \ No newline at end of file + } + + return(dta) + +} + From 0ca43a03773291c5aa4af3c5da77ee0ca366f4ac Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 12:08:26 -0400 Subject: [PATCH 020/212] Testing. --- R/BuildTimeSeries.R | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index a91cd2e..ae58a99 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -53,11 +53,16 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. if (!is.null(interpYears)) { + print("bts3.0") for (AncInt in 1:length(interpYears)) { + + print("bts3.0.0") cur_ancVi <- interpYears[AncInt] interpFrame <- dta@data[idField] interpFrame[idField] <- dta@data[idField] cnt = 2 + + print("bts3.0.1") for (k in 1:length(years)) { # First, build a model describing the relationship between years and any data in the interp field. varI <- paste(cur_ancVi,years[[k]],sep="") @@ -68,7 +73,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # add_data <- paste("interpFrame[cnt] <- dta@data$",varI) # eval(parse(text=add_data)) - interpFrame[cnt] <- dta@data[,varI] + interpFrame[cnt] <- dta@data[[varI]] colnames(interpFrame)[cnt] <- years[[k]] @@ -81,7 +86,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # add_data <- paste("interpFrame[cnt] <- dta@data$",varC) # eval(parse(text=add_data)) - interpFrame[cnt] <- dta@data[,varC] + interpFrame[cnt] <- dta@data[[varC]] cnt = 3 @@ -90,6 +95,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } + print("bts3.0.2") # Only one time point, so no interpolation is done - value is simply copied to all other columns. if (cnt == 3) { @@ -97,7 +103,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") # eval(parse(text=varI)) - dta@data[,cur_ancVi,years[[k]]] <- interpFrame[2] + dta@data[[paste(cur_ancVi,years[[k]]], sep="")]] <- interpFrame[2] } } else { @@ -130,7 +136,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } - + + print("bts3.1") # Append interpolated fields to our melting lists for (v in 1:length(interpYears)) { varList_pre[[length(varList_pre)+1]] <- interpYears[v] From 045121118a169dae91c6af9bd575b07e396ae1ea Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 12:08:56 -0400 Subject: [PATCH 021/212] Testing. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ae58a99..d17182a 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -103,7 +103,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") # eval(parse(text=varI)) - dta@data[[paste(cur_ancVi,years[[k]]], sep="")]] <- interpFrame[2] + dta@data[[paste(cur_ancVi,years[[k]], sep="")]] <- interpFrame[2] } } else { From eeff4db99def66f50064372b8d30902dc899180d Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 12:37:58 -0400 Subject: [PATCH 022/212] Remove unnecessary interpolation from buildtimeseries func. --- R/BuildTimeSeries.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index d17182a..ea88f12 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -96,6 +96,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } print("bts3.0.2") + # this is a slow part + # Only one time point, so no interpolation is done - value is simply copied to all other columns. if (cnt == 3) { @@ -106,14 +108,17 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY dta@data[[paste(cur_ancVi,years[[k]], sep="")]] <- interpFrame[2] } - } else { + } else if (cnt < length(years) + 2) { + tDframe <- dta@data[idField] # Here, we model out everything. + # Melt the dataframe for modeling melt_Model_dta <- melt(data.frame(interpFrame),id=idField) melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) - + + # Fit the model for interpolation execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)",sep="") @@ -132,9 +137,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY dta@data[varI] <- predict(mdl,newdata=tDframe) } - } + } } - + } print("bts3.1") From f68e73eb6fc568bb3228d0b0335688881d3856c0 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 13:52:55 -0400 Subject: [PATCH 023/212] Update. --- R/BuildTimeSeries.R | 27 +++++++++++++-------------- R/SAT.R | 1 + R/SpatialCausalPSM.R | 18 +++++++++--------- R/loadLibs.R | 5 ++--- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ea88f12..a9f5d6c 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -14,10 +14,10 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (k in 1:length(years)) { for (j in 1:length(colYears)) { - varN <- paste("TrtMnt_", colYears[j], years[k], sep="") + varN <- paste("TrtMnt_",colYears[j],years[k], sep="") print(varN) - exec <- paste("dta$",varN,"=0",sep="") + exec <- paste("dta$",varN,"=0", sep="") eval(parse(text=exec)) # dta[,varN] = 0 @@ -37,8 +37,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY timer <- proc.time() for (j in 1:length(colYears)) { - trt_id = paste("TrtMnt_",colYears[j],sep="") - interpYears <- c(interpYears,trt_id) + trt_id = paste("TrtMnt_",colYears[j], sep="") + interpYears <- c(interpYears, trt_id) } print(interpYears) @@ -65,7 +65,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.0.1") for (k in 1:length(years)) { # First, build a model describing the relationship between years and any data in the interp field. - varI <- paste(cur_ancVi,years[[k]],sep="") + varI <- paste(cur_ancVi,years[[k]], sep="") print(varI) # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. if (varI %in% colnames(dta@data)) { @@ -80,7 +80,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY cnt = cnt + 1 } else { # Exception for a single-point interpolation - varC <- paste(cur_ancVi,sep="") + varC <- paste(cur_ancVi, sep="") if (varC %in% colnames(dta@data)) { @@ -115,26 +115,25 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Here, we model out everything. # Melt the dataframe for modeling - melt_Model_dta <- melt(data.frame(interpFrame),id=idField) + melt_Model_dta <- melt(data.frame(interpFrame), id=idField) melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) # Fit the model for interpolation - execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)",sep="") + execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)", sep="") eval(parse(text=execstr)) - # mdl <- lm(value ~ variable + factor(idField), data=melt_Model_dta) # Apply the model to interpolate for (u in 1:length(years)) { - varI <- paste(cur_ancVi,years[[u]],sep="") + varI <- paste(cur_ancVi,years[[u]], sep="") if (!(varI %in% colnames(dta@data))) { # Variable doesn't exist, so we need to interpolate. tDframe[idField] <- dta@data[idField] tDframe["variable"] <- years[[u]] - dta@data[varI] <- predict(mdl,newdata=tDframe) + dta@data[varI] <- predict(mdl, newdata=tDframe) } } @@ -167,11 +166,11 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY grepStrYrs = idField for (j in 1:length(years)) { tempGrep <- grepStrYrs - grepStrYrs <- paste(tempGrep,"|",varList_pre[[i]],years[[j]],sep="") + grepStrYrs <- paste(tempGrep,"|",varList_pre[[i]],years[[j]], sep="") } - tDF <- dta@data[grepl(grepStrYrs,names(dta@data))] - meltList[[i]] <- melt(tDF,id=idField) + tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] + meltList[[i]] <- melt(tDF, id=idField) # Keep only years in the year column, rename columns colnames(meltList[[i]])[2] <- "Year" diff --git a/R/SAT.R b/R/SAT.R index 1aa2f11..d9d3a71 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -76,6 +76,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat3.2") if (mtd == "fastNN") { + # *** # this is the slow part of functions temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 4a284c4..4ba9ef2 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -15,7 +15,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retData <- dta # predict values based on model - retData$PSM_trtProb <- predict(PSMfit, dta@data, type="response") + retData[["PSM_trtProb"]] <- predict(PSMfit, dta@data, type="response") if (visual == "TRUE") { @@ -29,13 +29,13 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { if (drop == "support") { # Drop - treated <- retData@data[retData@data$TrtBin == 1,] - untreated <- retData@data[retData@data$TrtBin == 0,] - min_cut <- max(min(treated$PSM_trtProb), min(untreated$PSM_trtProb)) - max_cut <- min(max(treated$PSM_trtProb), max(untreated$PSM_trtProb)) + treated <- retData@data[retData@data[["TrtBin"]] == 1,] + untreated <- retData@data[retData@data[["TrtBin"]] == 0,] + min_cut <- max(min(treated[["PSM_trtProb"]]), min(untreated[["PSM_trtProb"]])) + max_cut <- min(max(treated[["PSM_trtProb"]]), max(untreated[["PSM_trtProb"]])) - retData <- retData[retData@data$PSM_trtProb >= min_cut,] - retData <- retData[retData@data$PSM_trtProb <= max_cut,] + retData <- retData[retData@data[["PSM_trtProb"]] >= min_cut,] + retData <- retData[retData@data[["PSM_trtProb"]] <= max_cut,] } @@ -50,7 +50,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # return original and predicted data along with model retEle <- 0 - retEle$data <- retData - retEle$model <- PSMfit + retEle[["data"]] <- retData + retEle[["model"]] <- PSMfit return (retEle) } diff --git a/R/loadLibs.R b/R/loadLibs.R index 8cf97ca..2eb2d52 100644 --- a/R/loadLibs.R +++ b/R/loadLibs.R @@ -1,6 +1,5 @@ #Library loading script in case dependencies are not loading correctly. -loadLibs <- function (x=1) - { +loadLibs <- function (x=1) { require(sp) #require(GISTools) # sudo apt-get install libgeos-dev require(maptools) @@ -20,4 +19,4 @@ loadLibs <- function (x=1) library(stargazer) library(lmtest) library(multiwayvcov) - } \ No newline at end of file +} From 2bc1f94f540d7cda3130923ffee6422b915f6825 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 14:49:06 -0400 Subject: [PATCH 024/212] Clean up. --- R/BuildTimeSeries.R | 2 +- R/Stage2PSM.R | 140 ++++++++++++++++++++--------------------- R/fastNN_binary_func.R | 20 +++--- 3 files changed, 78 insertions(+), 84 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index a9f5d6c..038f789 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -116,7 +116,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Melt the dataframe for modeling melt_Model_dta <- melt(data.frame(interpFrame), id=idField) - melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) + melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta[["variable"]])) # Fit the model for interpolation diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index a0054b4..7fcaa92 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -3,82 +3,76 @@ #These functions are to make common modeling strategies easier to specify for users #That do not write their own models. -Stage2PSM <- function(model, dta, type, table_out = NULL, opts = NULL) -{ +Stage2PSM <- function(model, dta, type, table_out = NULL, opts = NULL) { - ret_var <- list() + ret_var <- list() - if(type == "lm") - { - m_fit <- lm(model,dta) - print("==========================") - print("UNSTANDARDIZED MODEL RESULTS") - print("==========================") - #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") - print(summary(m_fit)) - ret_var$unstandardized <- lm(model,dta) - texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)",custom.model.names="Unstandardized Model",custom.note=model) - - if(!is.null(table_out)) - { - dta_tmp <- dta - - if(class(dta) == "data.frame") - { - d_index <- sapply(dta_tmp, is.numeric) - dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) - } else { - d_index <- sapply(dta_tmp@data, is.numeric) - dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) - } - dta_fit_std <- lm(model,dta_tmp) - ret_var$standardized <- lm(model,dta_tmp) - print("==========================") - print("STANDARDIZED MODEL RESULTS") - print("==========================") - print(summary(dta_fit_std)) - texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)",custom.model.names="Standardized Model", custom.note=model) + if(type == "lm") { + m_fit <- lm(model,dta) + print("==========================") + print("UNSTANDARDIZED MODEL RESULTS") + print("==========================") + #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") + print(summary(m_fit)) + ret_var[["unstandardized"]] <- lm(model,dta) + texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)",custom.model.names="Unstandardized Model",custom.note=model) + + if(!is.null(table_out)) { + dta_tmp <- dta - } + if(class(dta) == "data.frame") { + d_index <- sapply(dta_tmp, is.numeric) + dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) + } else { + d_index <- sapply(dta_tmp@data, is.numeric) + dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) + } + + dta_fit_std <- lm(model,dta_tmp) + ret_var[["standardized"]] <- lm(model,dta_tmp) + print("==========================") + print("STANDARDIZED MODEL RESULTS") + print("==========================") + print(summary(dta_fit_std)) + texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)",custom.model.names="Standardized Model", custom.note=model) + + } - } + } - if(type == "cmreg") - { - m_fit <- lm(model,dta) - ret_var$unstandardized <- m_fit - #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") - #print.htmlTable(mTab) - print(summary(m_fit)) - texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Unstandardized Model",custom.note=model) - - if(!is.null(table_out)) - { - dta_tmp <- dta - - if(class(dta) == "data.frame") - { - d_index <- sapply(dta_tmp, is.numeric) - dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) - } else { - d_index <- sapply(dta_tmp@data, is.numeric) - dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) - } - dta_fit_std <- lm(model,dta_tmp) - ret_var$standardized <- dta_fit_std - print(summary(dta_fit_std)) - texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Standardized Model", custom.note=model) - + if(type == "cmreg") { + m_fit <- lm(model,dta) + ret_var[["unstandardized"]] <- m_fit + #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") + #print.htmlTable(mTab) + print(summary(m_fit)) + texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Unstandardized Model",custom.note=model) + + if(!is.null(table_out)) { + dta_tmp <- dta + + if(class(dta) == "data.frame") { + d_index <- sapply(dta_tmp, is.numeric) + dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) + } else { + d_index <- sapply(dta_tmp@data, is.numeric) + dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) + } + dta_fit_std <- lm(model,dta_tmp) + ret_var[["standardized"]] <- dta_fit_std + print(summary(dta_fit_std)) + texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Standardized Model", custom.note=model) + + } + + print(opts) + exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") + m_fit[["var"]] <- eval(parse(text=exec)) + CMREG <- coeftest(m_fit,m_fit[["var"]]) + print("cmReg:") + print(CMREG) + ret_var[["cmreg"]] <- CMREG + } - - print(opts) - exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") - m_fit$var <- eval(parse(text=exec)) - CMREG <- coeftest(m_fit,m_fit$var) - print("cmReg:") - print(CMREG) - ret_var$cmreg <- CMREG - - } - return(ret_var) -} \ No newline at end of file + return(ret_var) +} diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 73ec549..2ea0519 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -10,7 +10,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data$PSM_trtProb),] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]),] #Conduct the matching @@ -53,7 +53,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Control_ID = toString(eval(parse(text=cid_txt))) Control_ID = toString(untreated[[ids]][[mC]]) - mT = k$nn.index[mC] + mT = k[["nn.index"]][mC] # tid_txt = paste("treated$",ids,"[",mT,"]",sep="") # Treatment_ID = toString(eval(parse(text=tid_txt))) @@ -74,12 +74,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { x_dist = abs(tCoord[1] - tCoord[2]) euc_dist = sqrt(y_dist^2 + x_dist^2) - PSM_score = k$nn.dist[mC] + PSM_score = k[["nn.dist"]][mC] geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - k$nn.dist[mC] <- ((1-geog_Weight) * PSM_score) + k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) } @@ -87,10 +87,10 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k$nn.dist %in% sort(k$nn.dist)[1]) + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) #This will give us the matched index in the "treated" dataset. - best_m_treated = k$nn.index[best_m_control] + best_m_treated = k[["nn.index"]][best_m_control] #Control PSM ID # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") @@ -115,7 +115,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # eval(parse(text=tid_a_2)) # eval(parse(text=tid_a_3)) dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID - dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k$nn.dist[,1][best_m_control] + dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k[["nn.dist"]][,1][best_m_control] dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID)] = pair_id @@ -126,9 +126,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # eval(parse(text=cid_a_1)) # eval(parse(text=cid_a_2)) # eval(parse(text=cid_a_3)) - dta@data$match[which(dta@data[[ids]] == Treatment_ID)] = Control_ID - dta@data$PSM_distance[which(dta@data[[ids]] == Treatment_ID)] = k$nn.dist[,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Treatment_ID)] = pair_id + dta@data[["match"]][which(dta@data[[ids]] == Treatment_ID)] = Control_ID + dta@data[["PSM_distance"]][which(dta@data[[ids]] == Treatment_ID)] = k[["nn.dist"]][,1][best_m_control] + dta@data[["PSM_match_ID"]][which(dta@data[[ids]] == Treatment_ID)] = pair_id #Drop the paired match out of the iteration matrix # did_a_1 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Treatment_ID ,]",sep="") From 3b3d4b8283d974903ee42ed28c149d72a07a3d71 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 14:53:18 -0400 Subject: [PATCH 025/212] Clean up. --- R/BuildTimeSeries.R | 21 +-------------------- R/SAT.R | 29 ---------------------------- R/fastNN_binary_func.R | 43 +----------------------------------------- 3 files changed, 2 insertions(+), 91 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 038f789..4425749 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -2,8 +2,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY years <- startYear:endYear - - print("bts1") timer <- proc.time() @@ -15,11 +13,10 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (j in 1:length(colYears)) { varN <- paste("TrtMnt_",colYears[j],years[k], sep="") - print(varN) + exec <- paste("dta$",varN,"=0", sep="") eval(parse(text=exec)) - # dta[,varN] = 0 @@ -32,7 +29,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(paste("section completed in", timer[3], "seconds.")) - print("bts2") timer <- proc.time() @@ -47,7 +43,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(paste("section completed in", timer[3], "seconds.")) - print("bts3") timer <- proc.time() @@ -70,12 +65,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. if (varI %in% colnames(dta@data)) { - - # add_data <- paste("interpFrame[cnt] <- dta@data$",varI) - # eval(parse(text=add_data)) interpFrame[cnt] <- dta@data[[varI]] - colnames(interpFrame)[cnt] <- years[[k]] cnt = cnt + 1 } else { @@ -83,16 +74,11 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varC <- paste(cur_ancVi, sep="") if (varC %in% colnames(dta@data)) { - - # add_data <- paste("interpFrame[cnt] <- dta@data$",varC) - # eval(parse(text=add_data)) interpFrame[cnt] <- dta@data[[varC]] - cnt = 3 } } - } print("bts3.0.2") @@ -102,9 +88,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (cnt == 3) { for (k in 1:length(years)) { - - # varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") - # eval(parse(text=varI)) dta@data[[paste(cur_ancVi,years[[k]], sep="")]] <- interpFrame[2] } @@ -188,12 +171,10 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } - timer <- proc.time() - timer print(paste("section completed in", timer[3], "seconds.")) - # Finish up with a cherry on top meltListRet <- data.frame(meltList) diff --git a/R/SAT.R b/R/SAT.R index d9d3a71..dffaf8e 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -12,8 +12,6 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (!is.null(constraints)) { for (cst in 1:length(names(constraints))) { if (names(constraints)[cst] == "groups") { - # exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="") - # eval(parse(text=exec_stmnt)) dta[["ConstraintGroupSet_Opt"]] <- dta[[constraints["groups"]]] } else { @@ -136,30 +134,13 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo ed_v = sub("factor\\(","",anc_vars[i]) ed_v = sub(")","",ed_v) - # treat_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[3]],5)") - # treat_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[4]],5)") - - # control_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[3]],5)") - # control_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[4]],5)") - - # treat_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[3]],5)") - # treat_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[4]],5)") - - # control_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[3]],5)") - # control_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[4]],5)") - - # c_type = eval(parse(text=paste("class(init_dta@data$",ed_v,")"))) c_type = class(init_dta@data[[ed_v]]) print("sat7.1") if (c_type == "matrix") { - # exec_str = paste("dta@data$",ed_v,"<- as.numeric(dta@data$",ed_v,")",sep="") - # eval(parse(text=exec_str)) dta@data[[ed_v]] <- as.numeric(dta@data[[ed_v]]) - # exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") - # eval(parse(text=exec_str)) init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]]) c_type = "numeric" @@ -171,16 +152,6 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) - # treat_mean_pre <- eval(parse(text=treat_mean_pre)) - # treat_SD_pre <- eval(parse(text=treat_SD_pre)) - # control_mean_pre <- eval(parse(text=control_mean_pre)) - # control_SD_pre <- eval(parse(text=control_SD_pre)) - - # treat_mean_post <- eval(parse(text=treat_mean_post)) - # treat_SD_post <- eval(parse(text=treat_SD_post)) - # control_mean_post <- eval(parse(text=control_mean_post)) - # control_SD_post <- eval(parse(text=control_SD_post)) - treat_mean_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[3]], 5) treat_SD_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[4]], 5) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 2ea0519..7bd8408 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -14,16 +14,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Conduct the matching - - # str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") - # str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") - # eval(parse(text=str_trted)) - # eval(parse(text=str_untrted)) - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) dta@data[["match"]] <- -999 dta@data[["PSM_distance"]] <- -999 @@ -33,15 +26,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #to perturb pairs based on their distances. for (j in 1:it_cnt) { - # str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") - # str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") - # eval(parse(text=str_trted)) - # eval(parse(text=str_untrted)) - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] untreated <- sorted_dta[sorted_dta[[trtMntVar]] ==0,] - #Run the KNN for all neighbors. k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) @@ -49,25 +36,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { if (!is.null(dist_PSM)) { for (mC in 1:length(k[[1]])) { #Calculate the Euclidean Distance between pairs - # cid_txt = paste("untreated$",ids,"[",mC,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) Control_ID = toString(untreated[[ids]][[mC]]) mT = k[["nn.index"]][mC] - # tid_txt = paste("treated$",ids,"[",mT,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) - Treatment_ID = toString(treated[[ids]][[mT]]) + Treatment_ID = toString(treated[[ids]][[mT]]) #Find the control x,y location - # cCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Control_ID),])", sep="") - # cCoord = eval(parse(text=cCoord_e)) cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) #Find the treatment x,y location - # tCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Treatment_ID),])", sep="") - # tCoord = eval(parse(text=tCoord_e)) tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) y_dist = abs(cCoord[1] - cCoord[2]) @@ -93,13 +72,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { best_m_treated = k[["nn.index"]][best_m_control] #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) Control_ID = toString(untreated[[ids]][[best_m_control]]) #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) Treatment_ID = toString(treated[[ids]][[best_m_treated]]) @@ -108,33 +83,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Add the Treatment ID to the Control Row - # tid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Control_ID)] = Treatment_ID", sep="") - # tid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Control_ID)] = k$nn.dist[,1][best_m_control]",sep="") - # tid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Control_ID)] = pair_id", sep="") - # eval(parse(text=tid_a_1)) - # eval(parse(text=tid_a_2)) - # eval(parse(text=tid_a_3)) dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k[["nn.dist"]][,1][best_m_control] dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID)] = pair_id #Add the Control ID to the Treatment Row - # cid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Treatment_ID)] = Control_ID", sep="") - # cid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Treatment_ID)] = k$nn.dist[,1][best_m_control]", sep="") - # cid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Treatment_ID)] = pair_id", sep="") - # eval(parse(text=cid_a_1)) - # eval(parse(text=cid_a_2)) - # eval(parse(text=cid_a_3)) dta@data[["match"]][which(dta@data[[ids]] == Treatment_ID)] = Control_ID dta@data[["PSM_distance"]][which(dta@data[[ids]] == Treatment_ID)] = k[["nn.dist"]][,1][best_m_control] dta@data[["PSM_match_ID"]][which(dta@data[[ids]] == Treatment_ID)] = pair_id #Drop the paired match out of the iteration matrix - # did_a_1 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Treatment_ID ,]",sep="") - # did_a_2 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Control_ID ,]",sep="") - # eval(parse(text=did_a_1)) - # eval(parse(text=did_a_2)) sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] From f53f7f785c721367bb50b974ce8cf2ce7a2ba8ad Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 4 Sep 2015 15:42:35 -0400 Subject: [PATCH 026/212] Clean up. --- R/Stage2PSM.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 7fcaa92..a9567ba 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -3,24 +3,24 @@ #These functions are to make common modeling strategies easier to specify for users #That do not write their own models. -Stage2PSM <- function(model, dta, type, table_out = NULL, opts = NULL) { +Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { ret_var <- list() - if(type == "lm") { + if (type == "lm") { m_fit <- lm(model,dta) print("==========================") print("UNSTANDARDIZED MODEL RESULTS") print("==========================") #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") print(summary(m_fit)) - ret_var[["unstandardized"]] <- lm(model,dta) - texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)",custom.model.names="Unstandardized Model",custom.note=model) + ret_var[["unstandardized"]] <- lm(model, dta) + texreg::plotreg(m_fit, omit.coef="(match)|(Intercept)", custom.model.names="Unstandardized Model", custom.note=model) - if(!is.null(table_out)) { + if (!is.null(table_out)) { dta_tmp <- dta - if(class(dta) == "data.frame") { + if (class(dta) == "data.frame") { d_index <- sapply(dta_tmp, is.numeric) dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) } else { @@ -40,7 +40,7 @@ Stage2PSM <- function(model, dta, type, table_out = NULL, opts = NULL) { } - if(type == "cmreg") { + if (type == "cmreg") { m_fit <- lm(model,dta) ret_var[["unstandardized"]] <- m_fit #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") @@ -48,10 +48,10 @@ Stage2PSM <- function(model, dta, type, table_out = NULL, opts = NULL) { print(summary(m_fit)) texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Unstandardized Model",custom.note=model) - if(!is.null(table_out)) { + if (!is.null(table_out)) { dta_tmp <- dta - if(class(dta) == "data.frame") { + if( class(dta) == "data.frame") { d_index <- sapply(dta_tmp, is.numeric) dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) } else { From 94a654643f0a902f521f34e25e725bfe440f4ac4 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 11 Sep 2015 11:35:34 -0400 Subject: [PATCH 027/212] Update how timeRangeTrend finds year. --- R/fastNN_binary_func.R | 2 +- R/timeRangeTrend.R | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 7bd8408..fe2e52c 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -40,7 +40,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { mT = k[["nn.index"]][mC] - Treatment_ID = toString(treated[[ids]][[mT]]) + Treatment_ID = toString(treated[[ids]][[mT]]) #Find the control x,y location cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index 85f4afa..fbe7a5e 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -11,10 +11,14 @@ timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield) { analysisDF <- melt(tDF, id=c(IDfield)) # cleaned GREP - remove year digit placeholders - new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + + # get location of year in prefix + yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) # generate new year field by removing prefix from variable (original column names) - analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) + # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) + analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) # keep years in range specified analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] From d9227e4cd1cdae58ca9072c0e61a2d32f39d75ff Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 11 Sep 2015 16:31:13 -0400 Subject: [PATCH 028/212] Update how timeRangeTrend handles NA data. --- R/BuildTimeSeries.R | 39 ++++++++++++++++++++++++--------------- R/timeRangeTrend.R | 25 +++++++++++++++++++------ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 4425749..7ca5721 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -32,6 +32,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts2") timer <- proc.time() + + # add the "TrtMnt_" + colYears[j] prefix to interpYears for (j in 1:length(colYears)) { trt_id = paste("TrtMnt_",colYears[j], sep="") interpYears <- c(interpYears, trt_id) @@ -60,24 +62,21 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.0.1") for (k in 1:length(years)) { # First, build a model describing the relationship between years and any data in the interp field. - varI <- paste(cur_ancVi,years[[k]], sep="") - print(varI) + # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. + varI <- gsub('####', years[[k]], cur_ancVi) if (varI %in% colnames(dta@data)) { interpFrame[cnt] <- dta@data[[varI]] - colnames(interpFrame)[cnt] <- years[[k]] cnt = cnt + 1 - } else { - # Exception for a single-point interpolation - varC <- paste(cur_ancVi, sep="") - if (varC %in% colnames(dta@data)) { - interpFrame[cnt] <- dta@data[[varC]] + } else if (cur_ancVi %in% colnames(dta@data)) { - cnt = 3 - } + # Exception for a single-point interpolation + interpFrame[cnt] <- dta@data[[cur_ancVi]] + cnt = 3 + } } @@ -88,7 +87,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (cnt == 3) { for (k in 1:length(years)) { - dta@data[[paste(cur_ancVi,years[[k]], sep="")]] <- interpFrame[2] + # add _year to end of non temporal data + dta@data[[paste(cur_ancVi,years[[k]],sep="_")]] <- interpFrame[2] } } else if (cnt < length(years) + 2) { @@ -99,7 +99,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Melt the dataframe for modeling melt_Model_dta <- melt(data.frame(interpFrame), id=idField) - melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta[["variable"]])) + melt_Model_dta["variable"] <- as.numeric(gsub("X", "", melt_Model_dta[["variable"]])) # Fit the model for interpolation @@ -111,7 +111,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Apply the model to interpolate for (u in 1:length(years)) { - varI <- paste(cur_ancVi,years[[u]], sep="") + + varI <- gsub('####', years[[u]], cur_ancVi) if (!(varI %in% colnames(dta@data))) { # Variable doesn't exist, so we need to interpolate. tDframe[idField] <- dta@data[idField] @@ -144,13 +145,21 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY meltList <- list() for (i in 1:length(varList_pre)) { - # grep_str = paste(idField,"|",varList_pre[i],"[0-9][0-9][0-9][0-9]",sep="") + # Limit to only relevant years grepStrYrs = idField + for (j in 1:length(years)) { tempGrep <- grepStrYrs - grepStrYrs <- paste(tempGrep,"|",varList_pre[[i]],years[[j]], sep="") + + if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { + grepStrYrs <- paste(tempGrep,"|",paste(varList_pre[[i]]],years[[j]], sep="_"), sep="") + } else { + grepStrYrs <- paste(tempGrep,"|",gsub('####', years[[j]], varList_pre[[i]]]), sep="") + } } + + tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] meltList[[i]] <- melt(tDF, id=idField) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index fbe7a5e..ac41eda 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -1,6 +1,6 @@ # run linear model on data within year range as specified # by field prefix and return coefficients -timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield) { +timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { # create new dataframe from all columns in dta dataframe that # are either the ID or a year which is indicated by the prefix @@ -35,14 +35,27 @@ timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield) { # get all data corresponding to id from analysis dataframe ID_dat <- analysisDF[analysisDF[IDfield] == ID,] - # fit trend model - trend_mod <- lm(value ~ Year, data=ID_dat) + dat_length <-length(ID_dat) + count_na <-sum(is.na(ID_dat[['value']])) + count_non_na <- dat_length - count_na + percent_na <- count_na / dat_length + + # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA + if (percent_na > thresh || count_non_na < 2) { + + dta@data["newfieldID"][i,] <- NA + + } else { + # fit trend model + trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) + + # add trend coefficients to new field + dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] + } - # add trend coefficients to new field - dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] } # return new field with trend coefficients - return(dta$newfieldID) + return(dta[["newfieldID"]]) } From 716bd5589ce977badfefb0979c3584760914792c Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 11 Sep 2015 16:33:52 -0400 Subject: [PATCH 029/212] Fix syntax. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 7ca5721..f697687 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -153,9 +153,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY tempGrep <- grepStrYrs if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - grepStrYrs <- paste(tempGrep,"|",paste(varList_pre[[i]]],years[[j]], sep="_"), sep="") + grepStrYrs <- paste(tempGrep,"|",paste(varList_pre[[i]],years[[j]], sep="_"), sep="") } else { - grepStrYrs <- paste(tempGrep,"|",gsub('####', years[[j]], varList_pre[[i]]]), sep="") + grepStrYrs <- paste(tempGrep,"|",gsub('####', years[[j]], varList_pre[[i]]), sep="") } } From a8f17ecf951f54d573c224ef2f007cc317dd016e Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 14 Sep 2015 17:15:16 -0400 Subject: [PATCH 030/212] Update how SpatialCausalPSM handles NAs. --- R/SpatialCausalPSM.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 4ba9ef2..72a35e7 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -5,10 +5,10 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # generate model based on method if (mtd == "logit") { # generalized linear model - PSMfit <- glm(mdl, dta@data, family="binomial") + PSMfit <- glm(mdl, dta@data, family="binomial", na.action=na.omit) } else if (mtd == "lm") { # linear model - PSMfit <- lm(mdl, dta@data) + PSMfit <- lm(mdl, dta@data, na.action=na.omit) } # copy data From cb314e6f89682a099edec947b60cc961e1ad5c60 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 10:05:48 -0400 Subject: [PATCH 031/212] Update SpatialCausalPSM. --- R/SpatialCausalPSM.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 72a35e7..e1d49d3 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -31,9 +31,10 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # Drop treated <- retData@data[retData@data[["TrtBin"]] == 1,] untreated <- retData@data[retData@data[["TrtBin"]] == 0,] - min_cut <- max(min(treated[["PSM_trtProb"]]), min(untreated[["PSM_trtProb"]])) - max_cut <- min(max(treated[["PSM_trtProb"]]), max(untreated[["PSM_trtProb"]])) + min_cut <- max(min(treated[["PSM_trtProb"]], na.rm = TRUE), min(untreated[["PSM_trtProb"]], na.rm = TRUE)) + max_cut <- min(max(treated[["PSM_trtProb"]], na.rm = TRUE), max(untreated[["PSM_trtProb"]], na.rm = TRUE)) + retData <- retData[!is.na(retData@data[["PSM_trtProb"]]),] retData <- retData[retData@data[["PSM_trtProb"]] >= min_cut,] retData <- retData[retData@data[["PSM_trtProb"]] <= max_cut,] @@ -49,8 +50,8 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { } # return original and predicted data along with model - retEle <- 0 - retEle[["data"]] <- retData - retEle[["model"]] <- PSMfit + retEle <- c() + retEle$data <- retData + retEle$model <- PSMfit return (retEle) } From 3013570920c22deb8ebf6a97de8d2a6285e6f5a2 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 10:09:54 -0400 Subject: [PATCH 032/212] Update. --- R/SpatialCausalPSM.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index e1d49d3..e862f53 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -5,10 +5,10 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # generate model based on method if (mtd == "logit") { # generalized linear model - PSMfit <- glm(mdl, dta@data, family="binomial", na.action=na.omit) + PSMfit <- glm(mdl, dta@data, family="binomial") } else if (mtd == "lm") { # linear model - PSMfit <- lm(mdl, dta@data, na.action=na.omit) + PSMfit <- lm(mdl, dta@data) } # copy data From 4f7a3a1d24d7abe83e51d5b7dd6b341804372483 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 10:11:10 -0400 Subject: [PATCH 033/212] Update. --- R/SpatialCausalPSM.R | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index e862f53..e72e689 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -1,11 +1,12 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { + # Initialization pltObjs <- list() - + # generate model based on method if (mtd == "logit") { # generalized linear model - PSMfit <- glm(mdl, dta@data, family="binomial") + PSMfit <- glm(mdl, dta@data, family="binomial", na.action=na.omit) } else if (mtd == "lm") { # linear model PSMfit <- lm(mdl, dta@data) @@ -16,8 +17,8 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # predict values based on model retData[["PSM_trtProb"]] <- predict(PSMfit, dta@data, type="response") - - + + if (visual == "TRUE") { # Show user distributions. pltObjs[[1]] <- GroupCompHist(retData, "PSM_trtProb", "Initial PSM Balance", simple_out=FALSE) @@ -26,7 +27,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # Second, if a drop parameter - if set to "support", remove observations # that don't overlap in the PSM distribution. - if (drop == "support") { + #if (drop == "support") { # Drop treated <- retData@data[retData@data[["TrtBin"]] == 1,] @@ -37,8 +38,8 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retData <- retData[!is.na(retData@data[["PSM_trtProb"]]),] retData <- retData[retData@data[["PSM_trtProb"]] >= min_cut,] retData <- retData[retData@data[["PSM_trtProb"]] <= max_cut,] - - } + + #} if (visual == "TRUE") { # Post drop histograms @@ -53,5 +54,3 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retEle <- c() retEle$data <- retData retEle$model <- PSMfit - return (retEle) -} From 7bbb4482b9884c432ac66eefc047934866ba2366 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:06:59 -0400 Subject: [PATCH 034/212] Add debug prints to fastNN. --- R/fastNN_binary_func.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index fe2e52c..da3f19a 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -7,6 +7,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + print("nn1.0") + #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. @@ -22,6 +24,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 + print("nn2.0") + #Calculate a distance decay function #to perturb pairs based on their distances. for (j in 1:it_cnt) { @@ -63,7 +67,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } } - + + print("nn3.0") + #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) @@ -81,7 +87,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") - + print("nn4.0") + #Add the Treatment ID to the Control Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k[["nn.dist"]][,1][best_m_control] From 5809656d8c120ac687da4234ffb6807425f5fdc3 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:08:28 -0400 Subject: [PATCH 035/212] Test. --- R/timeRangeTrend.R | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index ac41eda..f57a6ed 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -24,38 +24,38 @@ timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] - # create empty field - dta@data["newfieldID"] <- 0 + # # create empty field + # dta@data["newfieldID"] <- 0 - # iterate over original dataframe - for (i in 1:length(dta)) { - # get id for row (in original data) - ID <- as.character(dta@data[IDfield][i,]) + # # iterate over original dataframe + # for (i in 1:length(dta)) { + # # get id for row (in original data) + # ID <- as.character(dta@data[IDfield][i,]) - # get all data corresponding to id from analysis dataframe - ID_dat <- analysisDF[analysisDF[IDfield] == ID,] + # # get all data corresponding to id from analysis dataframe + # ID_dat <- analysisDF[analysisDF[IDfield] == ID,] - dat_length <-length(ID_dat) - count_na <-sum(is.na(ID_dat[['value']])) - count_non_na <- dat_length - count_na - percent_na <- count_na / dat_length + # dat_length <-length(ID_dat) + # count_na <-sum(is.na(ID_dat[['value']])) + # count_non_na <- dat_length - count_na + # percent_na <- count_na / dat_length - # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA - if (percent_na > thresh || count_non_na < 2) { + # # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA + # if (percent_na > thresh || count_non_na < 2) { - dta@data["newfieldID"][i,] <- NA + # dta@data["newfieldID"][i,] <- NA - } else { - # fit trend model - trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) + # } else { + # # fit trend model + # trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) - # add trend coefficients to new field - dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] - } + # # add trend coefficients to new field + # dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] + # } - } + # } - # return new field with trend coefficients - return(dta[["newfieldID"]]) + # # return new field with trend coefficients + # return(dta[["newfieldID"]]) } From 76ea9b3c6b8b017200971fe55eede28cca6ada12 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:09:10 -0400 Subject: [PATCH 036/212] Test. --- R/timeRangeTrend.R | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index f57a6ed..4c0ba9d 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -2,27 +2,27 @@ # by field prefix and return coefficients timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { - # create new dataframe from all columns in dta dataframe that - # are either the ID or a year which is indicated by the prefix - grep_str = paste(IDfield, prefix, sep="|") - tDF <- dta@data[grepl(grep_str, names(dta@data))] + # # create new dataframe from all columns in dta dataframe that + # # are either the ID or a year which is indicated by the prefix + # grep_str = paste(IDfield, prefix, sep="|") + # tDF <- dta@data[grepl(grep_str, names(dta@data))] - # melt all years columns in new dataframe - analysisDF <- melt(tDF, id=c(IDfield)) + # # melt all years columns in new dataframe + # analysisDF <- melt(tDF, id=c(IDfield)) - # cleaned GREP - remove year digit placeholders - # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + # # cleaned GREP - remove year digit placeholders + # # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) - # get location of year in prefix - yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) + # # get location of year in prefix + # yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) - # generate new year field by removing prefix from variable (original column names) - # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) - analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) + # # generate new year field by removing prefix from variable (original column names) + # # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) + # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) - # keep years in range specified - analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] - analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] + # # keep years in range specified + # analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] + # analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] # # create empty field # dta@data["newfieldID"] <- 0 From 823a2e3282574dfb7e6de54dff05cf976f0b0645 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:10:28 -0400 Subject: [PATCH 037/212] Test. --- R/timeRangeTrend.R | 83 ++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index 4c0ba9d..387d2a9 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -2,60 +2,63 @@ # by field prefix and return coefficients timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { - # # create new dataframe from all columns in dta dataframe that - # # are either the ID or a year which is indicated by the prefix - # grep_str = paste(IDfield, prefix, sep="|") - # tDF <- dta@data[grepl(grep_str, names(dta@data))] + # create new dataframe from all columns in dta dataframe that + # are either the ID or a year which is indicated by the prefix + grep_str = paste(IDfield, prefix, sep="|") + tDF <- dta@data[grepl(grep_str, names(dta@data))] - # # melt all years columns in new dataframe - # analysisDF <- melt(tDF, id=c(IDfield)) + # melt all years columns in new dataframe + analysisDF <- melt(tDF, id=c(IDfield)) - # # cleaned GREP - remove year digit placeholders - # # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + # cleaned GREP - remove year digit placeholders + # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) - # # get location of year in prefix - # yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) + # get location of year in prefix + yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) - # # generate new year field by removing prefix from variable (original column names) - # # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) - # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) + # generate new year field by removing prefix from variable (original column names) + # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) + analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) - # # keep years in range specified - # analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] - # analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] + # keep years in range specified + analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] + analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] - # # create empty field - # dta@data["newfieldID"] <- 0 + # create empty field + dta@data["newfieldID"] <- 0 - # # iterate over original dataframe - # for (i in 1:length(dta)) { - # # get id for row (in original data) - # ID <- as.character(dta@data[IDfield][i,]) + # iterate over original dataframe + for (i in 1:length(dta)) { + # get id for row (in original data) + ID <- as.character(dta@data[IDfield][i,]) - # # get all data corresponding to id from analysis dataframe - # ID_dat <- analysisDF[analysisDF[IDfield] == ID,] + # get all data corresponding to id from analysis dataframe + ID_dat <- analysisDF[analysisDF[IDfield] == ID,] - # dat_length <-length(ID_dat) - # count_na <-sum(is.na(ID_dat[['value']])) - # count_non_na <- dat_length - count_na - # percent_na <- count_na / dat_length + dat_length <-length(ID_dat) + count_na <-sum(is.na(ID_dat[['value']])) + count_non_na <- dat_length - count_na + percent_na <- count_na / dat_length - # # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA - # if (percent_na > thresh || count_non_na < 2) { + # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA + if (percent_na > thresh || count_non_na < 2) { - # dta@data["newfieldID"][i,] <- NA + dta@data["newfieldID"][i,] <- NA - # } else { - # # fit trend model - # trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) + } else { + # fit trend model + trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) - # # add trend coefficients to new field - # dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] - # } + # add trend coefficients to new field + dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] + } - # } + } - # # return new field with trend coefficients - # return(dta[["newfieldID"]]) + # return new field with trend coefficients + return(dta[["newfieldID"]]) } + + + From 0998551973255516ce252bfd37b7a49bdaa22a69 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:11:00 -0400 Subject: [PATCH 038/212] Test. --- R/timeRangeTrend.R | 84 +++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index 387d2a9..0afd412 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -1,64 +1,64 @@ # run linear model on data within year range as specified # by field prefix and return coefficients -timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { +# timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { - # create new dataframe from all columns in dta dataframe that - # are either the ID or a year which is indicated by the prefix - grep_str = paste(IDfield, prefix, sep="|") - tDF <- dta@data[grepl(grep_str, names(dta@data))] +# # create new dataframe from all columns in dta dataframe that +# # are either the ID or a year which is indicated by the prefix +# grep_str = paste(IDfield, prefix, sep="|") +# tDF <- dta@data[grepl(grep_str, names(dta@data))] - # melt all years columns in new dataframe - analysisDF <- melt(tDF, id=c(IDfield)) +# # melt all years columns in new dataframe +# analysisDF <- melt(tDF, id=c(IDfield)) - # cleaned GREP - remove year digit placeholders - # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) +# # cleaned GREP - remove year digit placeholders +# # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) - # get location of year in prefix - yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) +# # get location of year in prefix +# yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) - # generate new year field by removing prefix from variable (original column names) - # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) - analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) +# # generate new year field by removing prefix from variable (original column names) +# # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) +# analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) - # keep years in range specified - analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] - analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] +# # keep years in range specified +# analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] +# analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] - # create empty field - dta@data["newfieldID"] <- 0 +# # create empty field +# dta@data["newfieldID"] <- 0 - # iterate over original dataframe - for (i in 1:length(dta)) { - # get id for row (in original data) - ID <- as.character(dta@data[IDfield][i,]) +# # iterate over original dataframe +# for (i in 1:length(dta)) { +# # get id for row (in original data) +# ID <- as.character(dta@data[IDfield][i,]) - # get all data corresponding to id from analysis dataframe - ID_dat <- analysisDF[analysisDF[IDfield] == ID,] +# # get all data corresponding to id from analysis dataframe +# ID_dat <- analysisDF[analysisDF[IDfield] == ID,] - dat_length <-length(ID_dat) - count_na <-sum(is.na(ID_dat[['value']])) - count_non_na <- dat_length - count_na - percent_na <- count_na / dat_length +# dat_length <-length(ID_dat) +# count_na <-sum(is.na(ID_dat[['value']])) +# count_non_na <- dat_length - count_na +# percent_na <- count_na / dat_length - # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA - if (percent_na > thresh || count_non_na < 2) { +# # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA +# if (percent_na > thresh || count_non_na < 2) { - dta@data["newfieldID"][i,] <- NA +# dta@data["newfieldID"][i,] <- NA - } else { - # fit trend model - trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) +# } else { +# # fit trend model +# trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) - # add trend coefficients to new field - dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] - } +# # add trend coefficients to new field +# dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] +# } - } +# } - # return new field with trend coefficients - return(dta[["newfieldID"]]) +# # return new field with trend coefficients +# return(dta[["newfieldID"]]) -} +# } From e81c87e5835f813db804104e34d2d8cc413972d6 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:12:10 -0400 Subject: [PATCH 039/212] Fix bug. --- R/SpatialCausalPSM.R | 3 ++ R/timeRangeTrend.R | 87 +++++++++++++++++++++----------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index e72e689..ea2eec3 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -54,3 +54,6 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retEle <- c() retEle$data <- retData retEle$model <- PSMfit + + return(retEle) +} diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index 0afd412..ac41eda 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -1,64 +1,61 @@ # run linear model on data within year range as specified # by field prefix and return coefficients -# timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { +timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { -# # create new dataframe from all columns in dta dataframe that -# # are either the ID or a year which is indicated by the prefix -# grep_str = paste(IDfield, prefix, sep="|") -# tDF <- dta@data[grepl(grep_str, names(dta@data))] + # create new dataframe from all columns in dta dataframe that + # are either the ID or a year which is indicated by the prefix + grep_str = paste(IDfield, prefix, sep="|") + tDF <- dta@data[grepl(grep_str, names(dta@data))] -# # melt all years columns in new dataframe -# analysisDF <- melt(tDF, id=c(IDfield)) + # melt all years columns in new dataframe + analysisDF <- melt(tDF, id=c(IDfield)) -# # cleaned GREP - remove year digit placeholders -# # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + # cleaned GREP - remove year digit placeholders + # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) -# # get location of year in prefix -# yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) + # get location of year in prefix + yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) -# # generate new year field by removing prefix from variable (original column names) -# # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) -# analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) + # generate new year field by removing prefix from variable (original column names) + # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) + analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) -# # keep years in range specified -# analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] -# analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] + # keep years in range specified + analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] + analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] -# # create empty field -# dta@data["newfieldID"] <- 0 + # create empty field + dta@data["newfieldID"] <- 0 -# # iterate over original dataframe -# for (i in 1:length(dta)) { -# # get id for row (in original data) -# ID <- as.character(dta@data[IDfield][i,]) + # iterate over original dataframe + for (i in 1:length(dta)) { + # get id for row (in original data) + ID <- as.character(dta@data[IDfield][i,]) -# # get all data corresponding to id from analysis dataframe -# ID_dat <- analysisDF[analysisDF[IDfield] == ID,] + # get all data corresponding to id from analysis dataframe + ID_dat <- analysisDF[analysisDF[IDfield] == ID,] -# dat_length <-length(ID_dat) -# count_na <-sum(is.na(ID_dat[['value']])) -# count_non_na <- dat_length - count_na -# percent_na <- count_na / dat_length + dat_length <-length(ID_dat) + count_na <-sum(is.na(ID_dat[['value']])) + count_non_na <- dat_length - count_na + percent_na <- count_na / dat_length -# # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA -# if (percent_na > thresh || count_non_na < 2) { + # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA + if (percent_na > thresh || count_non_na < 2) { -# dta@data["newfieldID"][i,] <- NA + dta@data["newfieldID"][i,] <- NA -# } else { -# # fit trend model -# trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) + } else { + # fit trend model + trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) -# # add trend coefficients to new field -# dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] -# } + # add trend coefficients to new field + dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] + } -# } + } -# # return new field with trend coefficients -# return(dta[["newfieldID"]]) + # return new field with trend coefficients + return(dta[["newfieldID"]]) -# } - - - +} From 1f036d12d8fc27335b410b424043e5c9f43144ce Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:14:01 -0400 Subject: [PATCH 040/212] Test. --- R/fastNN_binary_func.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index da3f19a..f7bdf67 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -78,6 +78,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { best_m_treated = k[["nn.index"]][best_m_control] #Control PSM ID + print(untreated[[ids]][[best_m_control]]) Control_ID = toString(untreated[[ids]][[best_m_control]]) #Treatment PSM ID @@ -88,7 +89,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { pair_id = paste(curgrp,j,sep="") print("nn4.0") - + #Add the Treatment ID to the Control Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k[["nn.dist"]][,1][best_m_control] From 6eb42dc8d7d924fd6e74d9d58ccd62049ae78216 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:23:52 -0400 Subject: [PATCH 041/212] Test. --- R/fastNN_binary_func.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f7bdf67..67e9b3c 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -77,12 +77,20 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #This will give us the matched index in the "treated" dataset. best_m_treated = k[["nn.index"]][best_m_control] - #Control PSM ID - print(untreated[[ids]][[best_m_control]]) - Control_ID = toString(untreated[[ids]][[best_m_control]]) + # #Control PSM ID + # Control_ID = toString(untreated[[ids]][[best_m_control]]) + # #Treatment PSM ID + # Treatment_ID = toString(treated[[ids]][[best_m_treated]]) + + #Control PSM ID + cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + Control_ID = toString(eval(parse(text=cid_txt))) + #Treatment PSM ID - Treatment_ID = toString(treated[[ids]][[best_m_treated]]) + tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + Treatment_ID = toString(eval(parse(text=tid_txt))) + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From 9352e0b25dcf7d24aeb9760e576726ddb181b7cd Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:33:22 -0400 Subject: [PATCH 042/212] Test. --- R/BuildTimeSeries.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index f697687..ff3e8a4 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -52,6 +52,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (!is.null(interpYears)) { print("bts3.0") for (AncInt in 1:length(interpYears)) { + print(AncInt) print("bts3.0.0") cur_ancVi <- interpYears[AncInt] From 622e914c33baf17e27af7640b585a27d039c4dca Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:34:44 -0400 Subject: [PATCH 043/212] Test. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ff3e8a4..3df67f6 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -86,14 +86,14 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Only one time point, so no interpolation is done - value is simply copied to all other columns. if (cnt == 3) { - + print("bts3.0.2a") for (k in 1:length(years)) { # add _year to end of non temporal data dta@data[[paste(cur_ancVi,years[[k]],sep="_")]] <- interpFrame[2] } } else if (cnt < length(years) + 2) { - + print("bts3.0.2b") tDframe <- dta@data[idField] # Here, we model out everything. From a06df241271b315dbcb276f334ab308f03c1ba64 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:35:29 -0400 Subject: [PATCH 044/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 3df67f6..0ca9c0e 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -52,7 +52,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (!is.null(interpYears)) { print("bts3.0") for (AncInt in 1:length(interpYears)) { - print(AncInt) + print(interpYears[AncInt]) print("bts3.0.0") cur_ancVi <- interpYears[AncInt] From bf2f2a2e8ca0a4dce04be8b779ae5c7bb7b72dc6 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:40:01 -0400 Subject: [PATCH 045/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 0ca9c0e..d9df0f0 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -12,7 +12,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (k in 1:length(years)) { for (j in 1:length(colYears)) { - varN <- paste("TrtMnt_",colYears[j],years[k], sep="") + varN <- paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) exec <- paste("dta$",varN,"=0", sep="") From 2fa6bfb94d5f2a5074913ad43553eef26aa3ff84 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:41:28 -0400 Subject: [PATCH 046/212] Test. --- R/BuildTimeSeries.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index d9df0f0..5f3a519 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -93,7 +93,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } else if (cnt < length(years) + 2) { - print("bts3.0.2b") + print("bts3.0.2b0") tDframe <- dta@data[idField] # Here, we model out everything. @@ -104,11 +104,13 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Fit the model for interpolation + print("bts3.0.2b1") execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)", sep="") eval(parse(text=execstr)) # mdl <- lm(value ~ variable + factor(idField), data=melt_Model_dta) + print("bts3.0.2b2") # Apply the model to interpolate for (u in 1:length(years)) { From 1595c8f5af93dbee4156b9c4e6385999e7c91f9c Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:42:52 -0400 Subject: [PATCH 047/212] Test. --- R/BuildTimeSeries.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 5f3a519..6ee4124 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -81,6 +81,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } + print(cnt) + print("bts3.0.2") # this is a slow part From a5ff1bf6864c0763f6ca26b137d9f41f2e9a193f Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:45:53 -0400 Subject: [PATCH 048/212] Test. --- R/BuildTimeSeries.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 6ee4124..4bda2de 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -28,6 +28,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY timer <- proc.time() - timer print(paste("section completed in", timer[3], "seconds.")) + print(colnames(dta)) print("bts2") timer <- proc.time() From dd1bf12b52ed726a7db9fc2530be45f7106408fd Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:48:26 -0400 Subject: [PATCH 049/212] Test. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 4bda2de..9719492 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -15,7 +15,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varN <- paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) - exec <- paste("dta$",varN,"=0", sep="") + exec <- paste("dta$",varN," = 0", sep="") eval(parse(text=exec)) # dta[,varN] = 0 @@ -28,7 +28,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY timer <- proc.time() - timer print(paste("section completed in", timer[3], "seconds.")) - print(colnames(dta)) + print(colnames(dta@data)) print("bts2") timer <- proc.time() From 0e8838b68ae2bae0711aa11a77eea9fa082d71b8 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:52:15 -0400 Subject: [PATCH 050/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 9719492..cdf8d1a 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -36,7 +36,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # add the "TrtMnt_" + colYears[j] prefix to interpYears for (j in 1:length(colYears)) { - trt_id = paste("TrtMnt_",colYears[j], sep="") + trt_id = paste("TrtMnt_",colYears[j],"_####", sep="") interpYears <- c(interpYears, trt_id) } From 70a5e48c45ff60a8acbffa40a590240e0e74ec88 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:56:31 -0400 Subject: [PATCH 051/212] Test. --- R/BuildTimeSeries.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index cdf8d1a..0289853 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -133,9 +133,10 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.1") # Append interpolated fields to our melting lists - for (v in 1:length(interpYears)) { - varList_pre[[length(varList_pre)+1]] <- interpYears[v] - } + varList_pre <- c(varList_pre, interpYears) + # for (v in 1:length(interpYears)) { + # varList_pre[[length(varList_pre)+1]] <- interpYears[v] + # } } From 1ec1c1c6251ae8def3a69e02eb9af306db7e0000 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 11:59:29 -0400 Subject: [PATCH 052/212] Test. --- R/BuildTimeSeries.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 0289853..ac5b34e 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -1,5 +1,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colYears=NULL, interpYears=NULL) { + print(varList_pre) + years <- startYear:endYear print("bts1") From 0d5bc8e37e18fa171b8abf5439ab2d6303493614 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:02:47 -0400 Subject: [PATCH 053/212] Test. --- R/BuildTimeSeries.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ac5b34e..4f934a2 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -1,7 +1,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colYears=NULL, interpYears=NULL) { print(varList_pre) - + years <- startYear:endYear print("bts1") @@ -154,7 +154,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY meltList <- list() for (i in 1:length(varList_pre)) { - + print("bts4.0") # Limit to only relevant years grepStrYrs = idField @@ -168,20 +168,29 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } + print("bts4.1") tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] meltList[[i]] <- melt(tDF, id=idField) + print("bts4.2") + # Keep only years in the year column, rename columns colnames(meltList[[i]])[2] <- "Year" + + print("bts4.3") + colnames(meltList[[i]])[3] <- varList_pre[[i]] - + + print("bts4.4") + # Clean up year column gsub_command <- paste("^",varList_pre[[i]],sep="") meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - + print("bts4.5") + # Remove ID and year if this is at least the second variable to avoid duplications. if (i > 1) { meltList[[i]] <- meltList[[i]][3] @@ -192,6 +201,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY timer <- proc.time() - timer print(paste("section completed in", timer[3], "seconds.")) + print("bts5") # Finish up with a cherry on top meltListRet <- data.frame(meltList) From 8515cdf56daa59712580ff5ebc5c973f3926dbd6 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:09:53 -0400 Subject: [PATCH 054/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 4f934a2..885d899 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -175,7 +175,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY meltList[[i]] <- melt(tDF, id=idField) print("bts4.2") - + print(colnames(meltList[[i]])) # Keep only years in the year column, rename columns colnames(meltList[[i]])[2] <- "Year" From 4a0702d15371d7a8b36460c89e151c5b92f29392 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:11:33 -0400 Subject: [PATCH 055/212] Test. --- R/BuildTimeSeries.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 885d899..157ea7b 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -155,6 +155,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (i in 1:length(varList_pre)) { print("bts4.0") + print(varList_pre[[i]]) + # Limit to only relevant years grepStrYrs = idField From 73c70278a8c0ab6449c74ee1d08a7c94d1141e6e Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:19:06 -0400 Subject: [PATCH 056/212] Test. --- R/BuildTimeSeries.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 157ea7b..136ce32 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -169,7 +169,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY grepStrYrs <- paste(tempGrep,"|",gsub('####', years[[j]], varList_pre[[i]]), sep="") } } - + print(grepStrYrs) + print("bts4.1") From df96e72f59c1c28bd5b104e65c3a9122b3012a88 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:20:54 -0400 Subject: [PATCH 057/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 136ce32..8fe40d9 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -172,7 +172,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(grepStrYrs) print("bts4.1") - + print(names(dta@data)) tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] meltList[[i]] <- melt(tDF, id=idField) From b9f605749f4ebb149fd5e519a16304772b4b96cd Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:30:35 -0400 Subject: [PATCH 058/212] Test. --- R/BuildTimeSeries.R | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 8fe40d9..7e26f2c 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -64,23 +64,25 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY cnt = 2 print("bts3.0.1") - for (k in 1:length(years)) { - # First, build a model describing the relationship between years and any data in the interp field. - - # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. - varI <- gsub('####', years[[k]], cur_ancVi) - if (varI %in% colnames(dta@data)) { - - interpFrame[cnt] <- dta@data[[varI]] - colnames(interpFrame)[cnt] <- years[[k]] - cnt = cnt + 1 - - } else if (cur_ancVi %in% colnames(dta@data)) { - + if (cur_ancVi %in% colnames(dta@data)) { + print("IN!!") # Exception for a single-point interpolation interpFrame[cnt] <- dta@data[[cur_ancVi]] cnt = 3 + } else { + for (k in 1:length(years)) { + # First, build a model describing the relationship between years and any data in the interp field. + + # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. + varI <- gsub('####', years[[k]], cur_ancVi) + if (varI %in% colnames(dta@data)) { + + interpFrame[cnt] <- dta@data[[varI]] + colnames(interpFrame)[cnt] <- years[[k]] + cnt = cnt + 1 + + } } } From 6e0788e6081d61ea0f0c3d5c381add87b2a1fde3 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:55:11 -0400 Subject: [PATCH 059/212] Test. --- R/BuildTimeSeries.R | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 7e26f2c..21264ae 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -1,9 +1,11 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colYears=NULL, interpYears=NULL) { - print(varList_pre) - + # generate year range years <- startYear:endYear + + + print("bts1") timer <- proc.time() @@ -11,8 +13,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Eventually could be extended to more than one column. if (!is.null(colYears)) { # For each variable, for each year, create a binary representing the treatment status. - for (k in 1:length(years)) { - for (j in 1:length(colYears)) { + for (j in 1:length(colYears)) { + for (k in 1:length(years)) { varN <- paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) @@ -21,20 +23,13 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY eval(parse(text=exec)) # dta[,varN] = 0 - - dta@data[varN][dta@data[colYears[j]] <= as.numeric(years[k])] <- 1 + dta@data[varN][as.Date(dta@data[colYears[j]]) <= as.Date(years[k])] <- 1 } } } - timer <- proc.time() - timer - print(paste("section completed in", timer[3], "seconds.")) - print(colnames(dta@data)) - print("bts2") - timer <- proc.time() - # add the "TrtMnt_" + colYears[j] prefix to interpYears for (j in 1:length(colYears)) { @@ -44,8 +39,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(interpYears) - timer <- proc.time() - timer - print(paste("section completed in", timer[3], "seconds.")) + + + print("bts3") @@ -53,11 +49,13 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. if (!is.null(interpYears)) { + print("bts3.0") + for (AncInt in 1:length(interpYears)) { + print(interpYears[AncInt]) - print("bts3.0.0") cur_ancVi <- interpYears[AncInt] interpFrame <- dta@data[idField] interpFrame[idField] <- dta@data[idField] @@ -65,7 +63,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.0.1") if (cur_ancVi %in% colnames(dta@data)) { - print("IN!!") # Exception for a single-point interpolation interpFrame[cnt] <- dta@data[[cur_ancVi]] cnt = 3 @@ -88,6 +85,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(cnt) + print("bts3.0.2") # this is a slow part @@ -138,17 +136,18 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.1") # Append interpolated fields to our melting lists varList_pre <- c(varList_pre, interpYears) - # for (v in 1:length(interpYears)) { - # varList_pre[[length(varList_pre)+1]] <- interpYears[v] - # } } + print(varList_pre) + timer <- proc.time() - timer print(paste("section completed in", timer[3], "seconds.")) + + print("bts4") timer <- proc.time() @@ -206,6 +205,10 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY timer <- proc.time() - timer print(paste("section completed in", timer[3], "seconds.")) + + + + print("bts5") # Finish up with a cherry on top From b73c000a010ad862869a72239c687c5c2ceb0b18 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 12:57:36 -0400 Subject: [PATCH 060/212] Test. --- R/BuildTimeSeries.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 21264ae..7d4f10e 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -14,6 +14,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (!is.null(colYears)) { # For each variable, for each year, create a binary representing the treatment status. for (j in 1:length(colYears)) { + + dta@data[colYears[j]] <- lapply(dta@data[colYears[j]], as.Date) + for (k in 1:length(years)) { varN <- paste("TrtMnt_",colYears[j],"_",years[k], sep="") @@ -23,7 +26,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY eval(parse(text=exec)) # dta[,varN] = 0 - dta@data[varN][as.Date(dta@data[colYears[j]]) <= as.Date(years[k])] <- 1 + + dta@data[varN][dta@data[colYears[j]] <= as.Date(years[k])] <- 1 } } } From 37642800984a281a491097ef94a52c7a47963a00 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:01:50 -0400 Subject: [PATCH 061/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 7d4f10e..9832437 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -27,7 +27,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # dta[,varN] = 0 - dta@data[varN][dta@data[colYears[j]] <= as.Date(years[k])] <- 1 + dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 } } } From 9840b3b921c3f97a5c1da3627ce35a4bad2adb46 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:07:19 -0400 Subject: [PATCH 062/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 9832437..e9d4e11 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -27,7 +27,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # dta[,varN] = 0 - dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 + dta@data[varN][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-")] <- 1 } } } From f96c62d1b2bff01d98fbf58fa18a0b332318c908 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:07:51 -0400 Subject: [PATCH 063/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index e9d4e11..79256f1 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -27,7 +27,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # dta[,varN] = 0 - dta@data[varN][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-")] <- 1 + dta@data[varN][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } } } From a8d737fbdc20c38a7dfb48293f1f6f7fad2566dc Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:20:09 -0400 Subject: [PATCH 064/212] Test. --- R/BuildTimeSeries.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 79256f1..320d346 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -22,12 +22,13 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varN <- paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) - exec <- paste("dta$",varN," = 0", sep="") - eval(parse(text=exec)) - # dta[,varN] = 0 + # exec <- paste("dta$",varN," = 0", sep="") + # eval(parse(text=exec)) + dta@data[[varN]] <- 0 - dta@data[varN][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 + + dta@data[[varN]][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } } } From 68ca7a5cf7cd8c5d34a5ac96edf3b62487272dcb Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:23:35 -0400 Subject: [PATCH 065/212] Test. --- R/BuildTimeSeries.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 320d346..263561f 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -25,10 +25,16 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # exec <- paste("dta$",varN," = 0", sep="") # eval(parse(text=exec)) - dta@data[[varN]] <- 0 - + dta@data[[varN]] <- lapply(dta@data[colYears[j]], function (cYear) { + if (cYear <= as.Date(paste(years[k],"01","01", sep="-"))) { + return(1) + } else { + return(0) + } + }) - dta@data[[varN]][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 + # dta@data[[varN]] <- 0 + # dta@data[[varN]][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } } } From 296caa8493c112370b74bce3ec26a31b4fa62f02 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:25:52 -0400 Subject: [PATCH 066/212] Test. --- R/BuildTimeSeries.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 263561f..b599725 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -39,6 +39,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } + return dta + print(colnames(dta@data)) From fd97e1b0d0d1d58946049bfe48781357200d11a7 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:26:12 -0400 Subject: [PATCH 067/212] Test. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index b599725..ec6a68f 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -39,8 +39,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } - return dta - + return(dta) + print(colnames(dta@data)) From 0f36d656d494c099ccc36185b093db8d5c913207 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:32:35 -0400 Subject: [PATCH 068/212] Test. --- R/BuildTimeSeries.R | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ec6a68f..ae93f7b 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -15,7 +15,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # For each variable, for each year, create a binary representing the treatment status. for (j in 1:length(colYears)) { - dta@data[colYears[j]] <- lapply(dta@data[colYears[j]], as.Date) + # dta@data[colYears[j]] <- lapply(dta@data[colYears[j]], as.Date) for (k in 1:length(years)) { @@ -25,16 +25,17 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # exec <- paste("dta$",varN," = 0", sep="") # eval(parse(text=exec)) - dta@data[[varN]] <- lapply(dta@data[colYears[j]], function (cYear) { - if (cYear <= as.Date(paste(years[k],"01","01", sep="-"))) { - return(1) - } else { - return(0) - } - }) + # dta@data[[varN]] <- lapply(dta@data[colYears[j]], function (colYear_date) { + # if (colYear_date <= as.Date(paste(years[k],"01","01", sep="-"))) { + # return(1) + # } else { + # return(0) + # } + # }) + + dta@data[[varN]] <- 0 + dta@data[[varN]][dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 - # dta@data[[varN]] <- 0 - # dta@data[[varN]][dta@data[colYears[j]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } } } From 2627231ecb689c3d0b8a6420d61035c845e037f2 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 13:55:53 -0400 Subject: [PATCH 069/212] Test. --- R/BuildTimeSeries.R | 91 +++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 53 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ae93f7b..59e5e5b 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -5,9 +5,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY - print("bts1") - timer <- proc.time() # If there is a "colYears" variable, convert to binaries. # Eventually could be extended to more than one column. @@ -15,40 +13,20 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # For each variable, for each year, create a binary representing the treatment status. for (j in 1:length(colYears)) { - # dta@data[colYears[j]] <- lapply(dta@data[colYears[j]], as.Date) - for (k in 1:length(years)) { - varN <- paste("TrtMnt_",colYears[j],"_",years[k], sep="") + varN = paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) - # exec <- paste("dta$",varN," = 0", sep="") - # eval(parse(text=exec)) - - # dta@data[[varN]] <- lapply(dta@data[colYears[j]], function (colYear_date) { - # if (colYear_date <= as.Date(paste(years[k],"01","01", sep="-"))) { - # return(1) - # } else { - # return(0) - # } - # }) - dta@data[[varN]] <- 0 dta@data[[varN]][dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } - } - } - - return(dta) - print(colnames(dta@data)) - - - # add the "TrtMnt_" + colYears[j] prefix to interpYears - for (j in 1:length(colYears)) { - trt_id = paste("TrtMnt_",colYears[j],"_####", sep="") - interpYears <- c(interpYears, trt_id) + # add the "TrtMnt_" + colYears[j] prefix to interpYears + trt_id = paste("TrtMnt_",colYears[j],"_####", sep="") + interpYears <- c(interpYears, trt_id) + } } print(interpYears) @@ -58,24 +36,29 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY - print("bts3") + print("bts2") timer <- proc.time() # If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. if (!is.null(interpYears)) { - print("bts3.0") - for (AncInt in 1:length(interpYears)) { print(interpYears[AncInt]) + cur_ancVi <- interpYears[AncInt] + + # create interpolation data frame and add id field interpFrame <- dta@data[idField] interpFrame[idField] <- dta@data[idField] + cnt = 2 - print("bts3.0.1") + + print("bts2.0.1") + + # add data to interpolation data frame if (cur_ancVi %in% colnames(dta@data)) { # Exception for a single-point interpolation interpFrame[cnt] <- dta@data[[cur_ancVi]] @@ -93,45 +76,47 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY colnames(interpFrame)[cnt] <- years[[k]] cnt = cnt + 1 - } + } } } print(cnt) - print("bts3.0.2") - # this is a slow part + print("bts2.0.2") - # Only one time point, so no interpolation is done - value is simply copied to all other columns. if (cnt == 3) { - print("bts3.0.2a") + # only one time point, so no interpolation is done - value is simply copied to all other columns. + + print("bts2.0.2a") for (k in 1:length(years)) { # add _year to end of non temporal data dta@data[[paste(cur_ancVi,years[[k]],sep="_")]] <- interpFrame[2] } } else if (cnt < length(years) + 2) { - print("bts3.0.2b0") - tDframe <- dta@data[idField] - - # Here, we model out everything. + # run model if data exists for at least two years but not for all years + print("bts2.0.2b0") # Melt the dataframe for modeling melt_Model_dta <- melt(data.frame(interpFrame), id=idField) melt_Model_dta["variable"] <- as.numeric(gsub("X", "", melt_Model_dta[["variable"]])) + print("bts2.0.2b1") + # Fit the model for interpolation - print("bts3.0.2b1") - + # this is a slow part execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)", sep="") eval(parse(text=execstr)) # mdl <- lm(value ~ variable + factor(idField), data=melt_Model_dta) - print("bts3.0.2b2") + + print("bts2.0.2b2") # Apply the model to interpolate + tDframe <- dta@data[idField] + for (u in 1:length(years)) { varI <- gsub('####', years[[u]], cur_ancVi) @@ -147,7 +132,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } - print("bts3.1") + print("bts2.1") # Append interpolated fields to our melting lists varList_pre <- c(varList_pre, interpYears) @@ -159,17 +144,17 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(paste("section completed in", timer[3], "seconds.")) + return(dta) - - print("bts4") + print("bts3") timer <- proc.time() # Run the melts meltList <- list() for (i in 1:length(varList_pre)) { - print("bts4.0") + print("bts3.0") print(varList_pre[[i]]) # Limit to only relevant years @@ -186,28 +171,28 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } print(grepStrYrs) - print("bts4.1") + print("bts3.1") print(names(dta@data)) tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] meltList[[i]] <- melt(tDF, id=idField) - print("bts4.2") + print("bts3.2") print(colnames(meltList[[i]])) # Keep only years in the year column, rename columns colnames(meltList[[i]])[2] <- "Year" - print("bts4.3") + print("bts3.3") colnames(meltList[[i]])[3] <- varList_pre[[i]] - print("bts4.4") + print("bts3.4") # Clean up year column gsub_command <- paste("^",varList_pre[[i]],sep="") meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - print("bts4.5") + print("bts3.5") # Remove ID and year if this is at least the second variable to avoid duplications. if (i > 1) { @@ -223,7 +208,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY - print("bts5") + print("bts4") # Finish up with a cherry on top meltListRet <- data.frame(meltList) From f0b68900f53b0697c981c39f4705eae8df8b501f Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 14:23:02 -0400 Subject: [PATCH 070/212] Test. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 59e5e5b..cdf4ff0 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -61,7 +61,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # add data to interpolation data frame if (cur_ancVi %in% colnames(dta@data)) { # Exception for a single-point interpolation - interpFrame[cnt] <- dta@data[[cur_ancVi]] + interpFrame[[cnt]] <- dta@data[[cur_ancVi]] cnt = 3 } else { @@ -72,7 +72,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varI <- gsub('####', years[[k]], cur_ancVi) if (varI %in% colnames(dta@data)) { - interpFrame[cnt] <- dta@data[[varI]] + interpFrame[[cnt]] <- dta@data[[varI]] colnames(interpFrame)[cnt] <- years[[k]] cnt = cnt + 1 From 700d531a899b599c7acfc9a55274cc5dd560f324 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:16:11 -0400 Subject: [PATCH 071/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index cdf4ff0..fae07ab 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -91,7 +91,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts2.0.2a") for (k in 1:length(years)) { # add _year to end of non temporal data - dta@data[[paste(cur_ancVi,years[[k]],sep="_")]] <- interpFrame[2] + dta@data[,paste(cur_ancVi,years[[k]],sep="_")] <- interpFrame[2] } } else if (cnt < length(years) + 2) { From 7aba41a38540ec289f0e0e873d8f7582dc0fa631 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:29:01 -0400 Subject: [PATCH 072/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index fae07ab..fb0815a 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -91,7 +91,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts2.0.2a") for (k in 1:length(years)) { # add _year to end of non temporal data - dta@data[,paste(cur_ancVi,years[[k]],sep="_")] <- interpFrame[2] + dta@data[paste(cur_ancVi,years[[k]],sep="_")] <- interpFrame[2] } } else if (cnt < length(years) + 2) { From 8baad95d5a040dfbf9c6ee229c8153326a0c3093 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:30:28 -0400 Subject: [PATCH 073/212] Test. --- R/BuildTimeSeries.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index fb0815a..06c38d2 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -144,8 +144,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(paste("section completed in", timer[3], "seconds.")) - return(dta) - print("bts3") timer <- proc.time() From 4a148fbbc0ef729a5cab2ffd58c60a66ab807b12 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:35:04 -0400 Subject: [PATCH 074/212] Test. --- R/BuildTimeSeries.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 06c38d2..fb0815a 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -144,6 +144,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(paste("section completed in", timer[3], "seconds.")) + return(dta) + print("bts3") timer <- proc.time() From ef5a7f3632940406df6f6fbdefcda3109636c6ca Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:40:44 -0400 Subject: [PATCH 075/212] Test. --- R/BuildTimeSeries.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index fb0815a..7a3c21a 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -144,7 +144,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(paste("section completed in", timer[3], "seconds.")) - return(dta) print("bts3") @@ -172,13 +171,13 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(grepStrYrs) print("bts3.1") - print(names(dta@data)) + # print(names(dta@data)) tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] meltList[[i]] <- melt(tDF, id=idField) print("bts3.2") - print(colnames(meltList[[i]])) + # print(colnames(meltList[[i]])) # Keep only years in the year column, rename columns colnames(meltList[[i]])[2] <- "Year" From 9cc855b8246951cb6edb414ae08fccc0ebd8c5eb Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:52:31 -0400 Subject: [PATCH 076/212] Test. --- R/BuildTimeSeries.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 7a3c21a..0c08971 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -188,9 +188,16 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.4") # Clean up year column - gsub_command <- paste("^",varList_pre[[i]],sep="") - meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) + # gsub_command <- paste("^",varList_pre[[i]],sep="") + # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) + if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####")}) + } else { + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,nchar(z)-nchar("####")+1,nchar(z)}) + } + + print("bts3.5") # Remove ID and year if this is at least the second variable to avoid duplications. From 3957e713892378698a02e8508404c68bb2e23b88 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:53:15 -0400 Subject: [PATCH 077/212] Test. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 0c08971..ec0367f 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -192,9 +192,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####")}) + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####"))}) } else { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,nchar(z)-nchar("####")+1,nchar(z)}) + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,nchar(z)-nchar("####")+1,nchar(z))}) } From ced445ee592be1a777280ff10fe1d4441f409271 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:54:47 -0400 Subject: [PATCH 078/212] Test. --- R/BuildTimeSeries.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ec0367f..632fdbe 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -192,9 +192,11 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####"))}) - } else { meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,nchar(z)-nchar("####")+1,nchar(z))}) + + } else { + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####"))}) + } From c4f322b4ade90b16e06c097b30afbbb5a79cec62 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:56:18 -0400 Subject: [PATCH 079/212] Test. --- R/BuildTimeSeries.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 632fdbe..27670c2 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -192,10 +192,12 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,nchar(z)-nchar("####")+1,nchar(z))}) + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + return(substr(z,nchar(z)-nchar("####")+1,nchar(z))) + }) } else { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####"))}) + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return(substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####")))}) } From c9e154af5d68d8581595b68cd1b2c003e14e10c0 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 15:58:05 -0400 Subject: [PATCH 080/212] Test. --- R/BuildTimeSeries.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 27670c2..a137837 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -193,11 +193,15 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - return(substr(z,nchar(z)-nchar("####")+1,nchar(z))) - }) + print(z) + return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) + }) } else { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) {return(substr(z,regexpr("####", varList_pre[[i]], fixed=TRUE)[1],nchar("####")))}) + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + print(z) + return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) + }) } From ce55dea03b433e736642c0f5fb2fab503cd4b5b4 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:02:41 -0400 Subject: [PATCH 081/212] Test. --- R/BuildTimeSeries.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index a137837..618c0e4 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -168,6 +168,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY grepStrYrs <- paste(tempGrep,"|",gsub('####', years[[j]], varList_pre[[i]]), sep="") } } + print(grepStrYrs) print("bts3.1") @@ -193,13 +194,15 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - print(z) + # print(z) + z = toString(z) return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) }) } else { meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - print(z) + # print(z) + z = toString(z) return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) }) From cf33f9ab4d28a9c61771e5955e157bf0c4fc7ef8 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:07:40 -0400 Subject: [PATCH 082/212] Test. --- R/BuildTimeSeries.R | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 618c0e4..b3e1870 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -192,21 +192,21 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # gsub_command <- paste("^",varList_pre[[i]],sep="") # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - # print(z) - z = toString(z) - return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) - }) - - } else { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - # print(z) - z = toString(z) - return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) - }) - - } + # if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { + # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + # # print(z) + # z = toString(z) + # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) + # }) + + # } else { + # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + # # print(z) + # z = toString(z) + # return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) + # }) + + # } print("bts3.5") From e49d4fccbac73c27e63ff6195e36324462420c3e Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:13:36 -0400 Subject: [PATCH 083/212] Test. --- R/BuildTimeSeries.R | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index b3e1870..2733ca7 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -192,28 +192,29 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # gsub_command <- paste("^",varList_pre[[i]],sep="") # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - # if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - # # print(z) - # z = toString(z) - # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) - # }) - - # } else { - # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - # # print(z) - # z = toString(z) - # return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) - # }) - - # } - print("bts3.5") - # Remove ID and year if this is at least the second variable to avoid duplications. if (i > 1) { + # Remove ID and year after first pass to avoid duplications meltList[[i]] <- meltList[[i]][3] + + } else { + # for first pass format year + + # format year + meltList[[i]][2] <- lapply(meltList[[i]][2], as.character) + if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) + }) + + } else { + meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) + }) + + } } } From 2b03b4c0e6c9f8f566e87c0991259271b1069a7e Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:14:51 -0400 Subject: [PATCH 084/212] Test. --- R/BuildTimeSeries.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 2733ca7..3a88fa1 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -204,17 +204,17 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # format year meltList[[i]][2] <- lapply(meltList[[i]][2], as.character) - if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) - }) + # if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { + # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) + # }) - } else { - meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) - }) + # } else { + # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { + # return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) + # }) - } + # } } } From f91c1eb1fd3853135b307c027486dea9d7df1b8a Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:17:29 -0400 Subject: [PATCH 085/212] Test. --- R/BuildTimeSeries.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 3a88fa1..ee4a61b 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -196,14 +196,14 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.5") if (i > 1) { - # Remove ID and year after first pass to avoid duplications + # remove id and year after first pass to avoid duplications meltList[[i]] <- meltList[[i]][3] - } else { + } #else { # for first pass format year - # format year - meltList[[i]][2] <- lapply(meltList[[i]][2], as.character) + # meltList[[i]][2] <- lapply(meltList[[i]][2], as.character) + # if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) @@ -215,7 +215,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # }) # } - } + # } } From 2766df74f7d20b6c0efc4174f59c9a5b1db57b2f Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:25:10 -0400 Subject: [PATCH 086/212] Test. --- R/BuildTimeSeries.R | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index ee4a61b..4e2897a 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -195,27 +195,15 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts3.5") - if (i > 1) { + if (i == 1) { + # set field to use for regex when formatting year field later + year_regex_field <- varList_pre[[i]] + + } else { # remove id and year after first pass to avoid duplications meltList[[i]] <- meltList[[i]][3] - } #else { - # for first pass format year - - # meltList[[i]][2] <- lapply(meltList[[i]][2], as.character) - - # if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { - # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) - # }) - - # } else { - # meltList[[i]][2] <- lapply(meltList[[i]][2], function (z) { - # return(substr(z, regexpr("####", varList_pre[[i]], fixed=TRUE)[1], nchar("####"))) - # }) - - # } - # } + } } @@ -228,9 +216,24 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts4") - # Finish up with a cherry on top + # convert meltList to data frame meltListRet <- data.frame(meltList) - + + # format year + meltListRet['Year'] <- lapply(meltListRet['Year'], as.character) + + if (regexpr("####", year_regex_field, fixed=TRUE)[1] == -1) { + meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) + }) + + } else { + meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + return(substr(z, regexpr("####", year_regex_field, fixed=TRUE)[1], nchar("####"))) + }) + + } + return(meltListRet) } From fd96062b919054d55ee98bf1ad8d7b49dea74dd4 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:26:54 -0400 Subject: [PATCH 087/212] Test. --- R/BuildTimeSeries.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 4e2897a..c24deec 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -219,20 +219,20 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # convert meltList to data frame meltListRet <- data.frame(meltList) - # format year - meltListRet['Year'] <- lapply(meltListRet['Year'], as.character) + # # format year + # meltListRet['Year'] <- lapply(meltListRet['Year'], as.character) - if (regexpr("####", year_regex_field, fixed=TRUE)[1] == -1) { - meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) - }) + # if (regexpr("####", year_regex_field, fixed=TRUE)[1] == -1) { + # meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) + # }) - } else { - meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - return(substr(z, regexpr("####", year_regex_field, fixed=TRUE)[1], nchar("####"))) - }) + # } else { + # meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + # return(substr(z, regexpr("####", year_regex_field, fixed=TRUE)[1], nchar("####"))) + # }) - } + # } return(meltListRet) } From 9f4c52d4bf745b7891df776265c2b0bbd3d3ae3d Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:37:30 -0400 Subject: [PATCH 088/212] Test. --- R/BuildTimeSeries.R | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index c24deec..c23c6d8 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -213,26 +213,23 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY - print("bts4") # convert meltList to data frame meltListRet <- data.frame(meltList) - # # format year - # meltListRet['Year'] <- lapply(meltListRet['Year'], as.character) - - # if (regexpr("####", year_regex_field, fixed=TRUE)[1] == -1) { - # meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - # return(substr(z, nchar(z)-nchar("####")+1, nchar(z))) - # }) - - # } else { - # meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - # return(substr(z, regexpr("####", year_regex_field, fixed=TRUE)[1], nchar("####"))) - # }) - - # } + # format year + regex_test <-regexpr("####", year_regex_field, fixed=TRUE)[1] + if (regex_test > -1) { + meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + return(as.numeric(substr(z, regex_test, regex_test+nchar("####")-1))) + }) + + } else { + meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + return(as.numeric(substr(z, nchar(z)-nchar("####")+1, nchar(z)))) + }) + } return(meltListRet) } From b2ecccaf02b49736a7f8d9223c0d052811a240e2 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:44:13 -0400 Subject: [PATCH 089/212] Test. --- R/BuildTimeSeries.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index c23c6d8..8d90d63 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -219,7 +219,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY meltListRet <- data.frame(meltList) # format year - regex_test <-regexpr("####", year_regex_field, fixed=TRUE)[1] + regex_test <- regexpr("####", year_regex_field, fixed=TRUE)[1] if (regex_test > -1) { meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { return(as.numeric(substr(z, regex_test, regex_test+nchar("####")-1))) @@ -231,6 +231,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY }) } + colnames(meltListRet) <- lapply(colnames(meltListRet), function (z) {return gsub("....", "#####", z, fixed=TRUE)}) + return(meltListRet) } From 705aba76120040fb18c6bba89452e56ffef0353b Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:45:07 -0400 Subject: [PATCH 090/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 8d90d63..2c7dfa5 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -231,7 +231,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY }) } - colnames(meltListRet) <- lapply(colnames(meltListRet), function (z) {return gsub("....", "#####", z, fixed=TRUE)}) + colnames(meltListRet) <- lapply(colnames(meltListRet), function (z) {return(gsub("....", "#####", z, fixed=TRUE))}) return(meltListRet) } From d904678a97a77621dc9d5c58a545cbef2275aee1 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:45:55 -0400 Subject: [PATCH 091/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 2c7dfa5..fe7afcf 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -231,7 +231,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY }) } - colnames(meltListRet) <- lapply(colnames(meltListRet), function (z) {return(gsub("....", "#####", z, fixed=TRUE))}) + colnames(meltListRet) <- lapply(colnames(meltListRet), function (z) {return(gsub("....", "####", z, fixed=TRUE))}) return(meltListRet) } From d6536bffb33833b7201943ae0408fd6afa8c2560 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:50:58 -0400 Subject: [PATCH 092/212] Test. --- R/Stage2PSM.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index a9567ba..bb64480 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -41,13 +41,17 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { } if (type == "cmreg") { + + print("c1") m_fit <- lm(model,dta) ret_var[["unstandardized"]] <- m_fit #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") #print.htmlTable(mTab) print(summary(m_fit)) texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Unstandardized Model",custom.note=model) - + + print("c2") + if (!is.null(table_out)) { dta_tmp <- dta @@ -65,6 +69,8 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { } + print("c3") + print(opts) exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") m_fit[["var"]] <- eval(parse(text=exec)) From bbd6e0fad6dc582a79e4f22c476d1d5576a4ba93 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 16:54:02 -0400 Subject: [PATCH 093/212] Test. --- R/Stage2PSM.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index bb64480..4f02ce8 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -42,9 +42,13 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { if (type == "cmreg") { - print("c1") + print("c1a") m_fit <- lm(model,dta) + print("c1b") + ret_var[["unstandardized"]] <- m_fit + + print("c1c") #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") #print.htmlTable(mTab) print(summary(m_fit)) @@ -68,7 +72,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Standardized Model", custom.note=model) } - + print("c3") print(opts) From e4e99f8551ef89184e64e679f3b778ed5757e194 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 17:04:48 -0400 Subject: [PATCH 094/212] Test. --- R/BuildTimeSeries.R | 18 ++++++++---------- R/Stage2PSM.R | 6 +++--- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index fe7afcf..cd07c88 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -24,7 +24,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } # add the "TrtMnt_" + colYears[j] prefix to interpYears - trt_id = paste("TrtMnt_",colYears[j],"_####", sep="") + trt_id = paste("TrtMnt_",colYears[j],"_....", sep="") interpYears <- c(interpYears, trt_id) } } @@ -69,7 +69,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # First, build a model describing the relationship between years and any data in the interp field. # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. - varI <- gsub('####', years[[k]], cur_ancVi) + varI <- gsub('....', years[[k]], cur_ancVi) if (varI %in% colnames(dta@data)) { interpFrame[[cnt]] <- dta@data[[varI]] @@ -119,7 +119,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (u in 1:length(years)) { - varI <- gsub('####', years[[u]], cur_ancVi) + varI <- gsub('....', years[[u]], cur_ancVi) if (!(varI %in% colnames(dta@data))) { # Variable doesn't exist, so we need to interpolate. tDframe[idField] <- dta@data[idField] @@ -162,10 +162,10 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (j in 1:length(years)) { tempGrep <- grepStrYrs - if (regexpr("####", varList_pre[[i]], fixed=TRUE)[1] == -1) { + if (regexpr("....", varList_pre[[i]], fixed=TRUE)[1] == -1) { grepStrYrs <- paste(tempGrep,"|",paste(varList_pre[[i]],years[[j]], sep="_"), sep="") } else { - grepStrYrs <- paste(tempGrep,"|",gsub('####', years[[j]], varList_pre[[i]]), sep="") + grepStrYrs <- paste(tempGrep,"|",gsub('....', years[[j]], varList_pre[[i]]), sep="") } } @@ -219,20 +219,18 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY meltListRet <- data.frame(meltList) # format year - regex_test <- regexpr("####", year_regex_field, fixed=TRUE)[1] + regex_test <- regexpr("....", year_regex_field, fixed=TRUE)[1] if (regex_test > -1) { meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - return(as.numeric(substr(z, regex_test, regex_test+nchar("####")-1))) + return(as.numeric(substr(z, regex_test, regex_test+nchar("....")-1))) }) } else { meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - return(as.numeric(substr(z, nchar(z)-nchar("####")+1, nchar(z)))) + return(as.numeric(substr(z, nchar(z)-nchar("....")+1, nchar(z)))) }) } - colnames(meltListRet) <- lapply(colnames(meltListRet), function (z) {return(gsub("....", "####", z, fixed=TRUE))}) - return(meltListRet) } diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 4f02ce8..2474dad 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -8,7 +8,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { ret_var <- list() if (type == "lm") { - m_fit <- lm(model,dta) + m_fit <- lm(model, dta) print("==========================") print("UNSTANDARDIZED MODEL RESULTS") print("==========================") @@ -43,9 +43,9 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { if (type == "cmreg") { print("c1a") - m_fit <- lm(model,dta) - print("c1b") + m_fit <- lm(model, dta) + print("c1b") ret_var[["unstandardized"]] <- m_fit print("c1c") From 512341ea2933f2e9d6c30c602d7aa5ca50a2d49c Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 17:09:11 -0400 Subject: [PATCH 095/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index cd07c88..8a77d7d 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -69,7 +69,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # First, build a model describing the relationship between years and any data in the interp field. # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. - varI <- gsub('....', years[[k]], cur_ancVi) + varI <- gsub('....', years[[k]], cur_ancVi, fixed=TRUE) if (varI %in% colnames(dta@data)) { interpFrame[[cnt]] <- dta@data[[varI]] From c9c9ad9938a9f17b534d5340834325bbef0d8fb0 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 17:11:20 -0400 Subject: [PATCH 096/212] Test. --- R/BuildTimeSeries.R | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 8a77d7d..5242bf2 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -119,7 +119,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY for (u in 1:length(years)) { - varI <- gsub('....', years[[u]], cur_ancVi) + varI <- gsub('....', years[[u]], cur_ancVi, fixed=TRUE) if (!(varI %in% colnames(dta@data))) { # Variable doesn't exist, so we need to interpolate. tDframe[idField] <- dta@data[idField] @@ -165,7 +165,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY if (regexpr("....", varList_pre[[i]], fixed=TRUE)[1] == -1) { grepStrYrs <- paste(tempGrep,"|",paste(varList_pre[[i]],years[[j]], sep="_"), sep="") } else { - grepStrYrs <- paste(tempGrep,"|",gsub('....', years[[j]], varList_pre[[i]]), sep="") + grepStrYrs <- paste(tempGrep,"|",gsub('....', years[[j]], varList_pre[[i]], fixed=TRUE), sep="") } } @@ -186,14 +186,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY colnames(meltList[[i]])[3] <- varList_pre[[i]] - print("bts3.4") - - # Clean up year column - # gsub_command <- paste("^",varList_pre[[i]],sep="") - # meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - - print("bts3.5") + print("bts3.4") if (i == 1) { # set field to use for regex when formatting year field later From c799e4b86cdeb5d75ea51208e98dfd42e20c3ee0 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 17:14:04 -0400 Subject: [PATCH 097/212] Test. --- R/Stage2PSM.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 2474dad..fa9ef0e 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -76,8 +76,12 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { print("c3") print(opts) - exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") - m_fit[["var"]] <- eval(parse(text=exec)) + + # exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") + # m_fit[["var"]] <- eval(parse(text=exec)) + + m_fit[["var"]] <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) + CMREG <- coeftest(m_fit,m_fit[["var"]]) print("cmReg:") print(CMREG) From 15d6156ca4eb087fee1fd6c568324c3950c9f521 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 15 Sep 2015 17:15:22 -0400 Subject: [PATCH 098/212] Test. --- R/Stage2PSM.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index fa9ef0e..9998754 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -80,7 +80,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") # m_fit[["var"]] <- eval(parse(text=exec)) - m_fit[["var"]] <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) + m_fit[["var"]] <- cluster.vcov(m_fit,cbind(dta[opts[1]])) CMREG <- coeftest(m_fit,m_fit[["var"]]) print("cmReg:") From f161ffe7b1b1ff69b8b99222657c2cda6cbd6ed9 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 11:17:02 -0400 Subject: [PATCH 099/212] Test. --- R/Stage2PSM.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 9998754..18c8006 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -14,7 +14,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { print("==========================") #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") print(summary(m_fit)) - ret_var[["unstandardized"]] <- lm(model, dta) + ret_var$unstandardized <- lm(model, dta) texreg::plotreg(m_fit, omit.coef="(match)|(Intercept)", custom.model.names="Unstandardized Model", custom.note=model) if (!is.null(table_out)) { @@ -29,7 +29,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { } dta_fit_std <- lm(model,dta_tmp) - ret_var[["standardized"]] <- lm(model,dta_tmp) + ret_var$standardized <- lm(model,dta_tmp) print("==========================") print("STANDARDIZED MODEL RESULTS") print("==========================") @@ -46,7 +46,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { m_fit <- lm(model, dta) print("c1b") - ret_var[["unstandardized"]] <- m_fit + ret_var$unstandardized <- m_fit print("c1c") #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") @@ -67,7 +67,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) } dta_fit_std <- lm(model,dta_tmp) - ret_var[["standardized"]] <- dta_fit_std + ret_var$standardized <- dta_fit_std print(summary(dta_fit_std)) texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Standardized Model", custom.note=model) @@ -80,12 +80,12 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") # m_fit[["var"]] <- eval(parse(text=exec)) - m_fit[["var"]] <- cluster.vcov(m_fit,cbind(dta[opts[1]])) + m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]])) - CMREG <- coeftest(m_fit,m_fit[["var"]]) + CMREG <- coeftest(m_fit,m_fit$var) print("cmReg:") print(CMREG) - ret_var[["cmreg"]] <- CMREG + ret_var$cmreg <- CMREG } return(ret_var) From 82352ddd8e477a41770878ed375de5213df7ed62 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 11:21:22 -0400 Subject: [PATCH 100/212] Test. --- R/Stage2PSM.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 18c8006..c056eab 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -24,8 +24,8 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { d_index <- sapply(dta_tmp, is.numeric) dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) } else { - d_index <- sapply(dta_tmp@data, is.numeric) - dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) + d_index <- sapply(dta_tmp@data, is.numeric) + dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) } dta_fit_std <- lm(model,dta_tmp) @@ -80,7 +80,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") # m_fit[["var"]] <- eval(parse(text=exec)) - m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]])) + m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) CMREG <- coeftest(m_fit,m_fit$var) print("cmReg:") From 669d7bde15d0943e96e1630656ce042789b2e2a1 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 11:32:48 -0400 Subject: [PATCH 101/212] Test. --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 5242bf2..b42c89f 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -216,12 +216,12 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY regex_test <- regexpr("....", year_regex_field, fixed=TRUE)[1] if (regex_test > -1) { meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - return(as.numeric(substr(z, regex_test, regex_test+nchar("....")-1))) + return(as.integer(substr(z, regex_test, regex_test+nchar("....")-1))) }) } else { meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { - return(as.numeric(substr(z, nchar(z)-nchar("....")+1, nchar(z)))) + return(as.integer(substr(z, nchar(z)-nchar("....")+1, nchar(z)))) }) } From 2e89179ad77ae7729e8ac721f4c2ce2631fe071d Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 12:13:06 -0400 Subject: [PATCH 102/212] Test. --- R/BuildTimeSeries.R | 12 ++++++------ R/Stage2PSM.R | 10 +++++++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index b42c89f..f9e7176 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -18,8 +18,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varN = paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) - dta@data[[varN]] <- 0 - dta@data[[varN]][dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 + dta@data[,varN] <- 0 + dta@data[,varN][dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } @@ -61,7 +61,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # add data to interpolation data frame if (cur_ancVi %in% colnames(dta@data)) { # Exception for a single-point interpolation - interpFrame[[cnt]] <- dta@data[[cur_ancVi]] + interpFrame[cnt] <- dta@data[,cur_ancVi] cnt = 3 } else { @@ -72,7 +72,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varI <- gsub('....', years[[k]], cur_ancVi, fixed=TRUE) if (varI %in% colnames(dta@data)) { - interpFrame[[cnt]] <- dta@data[[varI]] + interpFrame[cnt] <- dta@data[,varI] colnames(interpFrame)[cnt] <- years[[k]] cnt = cnt + 1 @@ -91,7 +91,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print("bts2.0.2a") for (k in 1:length(years)) { # add _year to end of non temporal data - dta@data[paste(cur_ancVi,years[[k]],sep="_")] <- interpFrame[2] + dta@data[,paste(cur_ancVi,years[[k]],sep="_")] <- interpFrame[2] } } else if (cnt < length(years) + 2) { @@ -100,7 +100,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY # Melt the dataframe for modeling melt_Model_dta <- melt(data.frame(interpFrame), id=idField) - melt_Model_dta["variable"] <- as.numeric(gsub("X", "", melt_Model_dta[["variable"]])) + melt_Model_dta["variable"] <- as.numeric(gsub("X", "", melt_Model_dta[,"variable"])) print("bts2.0.2b1") diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index c056eab..c28d035 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -80,7 +80,15 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") # m_fit[["var"]] <- eval(parse(text=exec)) - m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) + if (length(opts) == 1) { + m_fit$var <- cluster.vcov(m_fit,dta[opts[1]])) + + } else if (length(opts) == 2) { + m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) + + } else { + return("CANNOT HAVE MORE THAN 2 OPTS") + } CMREG <- coeftest(m_fit,m_fit$var) print("cmReg:") From 099bcd5b6528f2db48b5b44c1394f52188534c58 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 12:13:58 -0400 Subject: [PATCH 103/212] Test. --- R/Stage2PSM.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index c28d035..4305cfe 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -81,7 +81,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # m_fit[["var"]] <- eval(parse(text=exec)) if (length(opts) == 1) { - m_fit$var <- cluster.vcov(m_fit,dta[opts[1]])) + m_fit$var <- cluster.vcov(m_fit,dta[opts[1]]) } else if (length(opts) == 2) { m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) From 4d9ef9dd050deacaffcd561a0321cd63c3c32d3a Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 12:53:56 -0400 Subject: [PATCH 104/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index f9e7176..9fa05ed 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -19,7 +19,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(varN) dta@data[,varN] <- 0 - dta@data[,varN][dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 + dta@data[,varN][is.na(dta@data[[colYears[j]]]) || dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } From b02c0c460d9d06032d1df57d7eab7a3db66483db Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:09:55 -0400 Subject: [PATCH 105/212] Test. --- R/SpatialCausalPSM.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index ea2eec3..d723fdf 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -16,7 +16,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retData <- dta # predict values based on model - retData[["PSM_trtProb"]] <- predict(PSMfit, dta@data, type="response") + retData[,"PSM_trtProb"] <- predict(PSMfit, dta@data, type="response") if (visual == "TRUE") { @@ -30,14 +30,14 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { #if (drop == "support") { # Drop - treated <- retData@data[retData@data[["TrtBin"]] == 1,] - untreated <- retData@data[retData@data[["TrtBin"]] == 0,] - min_cut <- max(min(treated[["PSM_trtProb"]], na.rm = TRUE), min(untreated[["PSM_trtProb"]], na.rm = TRUE)) - max_cut <- min(max(treated[["PSM_trtProb"]], na.rm = TRUE), max(untreated[["PSM_trtProb"]], na.rm = TRUE)) + treated <- retData@data[retData@data[,"TrtBin"] == 1,] + untreated <- retData@data[retData@data[,"TrtBin"] == 0,] + min_cut <- max(min(treated[,"PSM_trtProb"], na.rm = TRUE), min(untreated[,"PSM_trtProb"], na.rm = TRUE)) + max_cut <- min(max(treated[,"PSM_trtProb"], na.rm = TRUE), max(untreated[,"PSM_trtProb"], na.rm = TRUE)) - retData <- retData[!is.na(retData@data[["PSM_trtProb"]]),] - retData <- retData[retData@data[["PSM_trtProb"]] >= min_cut,] - retData <- retData[retData@data[["PSM_trtProb"]] <= max_cut,] + retData <- retData[!is.na(retData@data[,"PSM_trtProb"]),] + retData <- retData[retData@data[,"PSM_trtProb"] >= min_cut,] + retData <- retData[retData@data[,"PSM_trtProb"] <= max_cut,] #} From 48b8d1a61c8081bf91fc12fbcdf46a1f0a2fc4c4 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:17:04 -0400 Subject: [PATCH 106/212] Test. --- R/BuildTimeSeries.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 9fa05ed..7bc86fa 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -19,7 +19,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(varN) dta@data[,varN] <- 0 - dta@data[,varN][is.na(dta@data[[colYears[j]]]) || dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 + dta@data[,varN][!is.na(dta@data[[colYears[j]]]) || dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 } From a4ca7856913b8dadda692c5b9dbf9c0df44e282f Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:19:17 -0400 Subject: [PATCH 107/212] Test. --- R/SpatialCausalPSM.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index d723fdf..992a39a 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -16,7 +16,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retData <- dta # predict values based on model - retData[,"PSM_trtProb"] <- predict(PSMfit, dta@data, type="response") + retData@data[,"PSM_trtProb"] <- predict(PSMfit, dta@data, type="response") if (visual == "TRUE") { From 3f4b18f76c40065eb200a6352aea54cbd9d6479b Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:30:06 -0400 Subject: [PATCH 108/212] Test. --- R/SAT.R | 32 ++++++++++++++++---------------- R/SpatialCausalPSM.R | 6 +++--- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index dffaf8e..4f3fc83 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -12,10 +12,10 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (!is.null(constraints)) { for (cst in 1:length(names(constraints))) { if (names(constraints)[cst] == "groups") { - dta[["ConstraintGroupSet_Opt"]] <- dta[[constraints["groups"]]] + dta@data[,"ConstraintGroupSet_Opt"] <- dta@data[,constraints["groups"]] } else { - dta[["ConstraintGroupSet_Opt"]] <- 1 + dta@data[,"ConstraintGroupSet_Opt"] <- 1 } if (names(constraints)[cst] == "distance") { dist_PSM = as.numeric(constraints["distance"][[1]]) @@ -24,7 +24,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } } } else { - dta[["ConstraintGroupSet_Opt"]] <- 1 + dta@data[,"ConstraintGroupSet_Opt"] <- 1 #max the distance threshold by taking the diagonal of the bounding box. dist_PSM = NULL } @@ -32,7 +32,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat2") #Caclulate the number of groups to constrain by, if any. - group_constraints <- unique(dta[["ConstraintGroupSet_Opt"]]) + group_constraints <- unique(dta@data[,"ConstraintGroupSet_Opt"]) #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) t_dta <- list() @@ -44,20 +44,20 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo grp_index = length(grp_list)+1 t_index = length(t_dta)+1 grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] - t_dta[[t_index]] <- dta[dta[["TrtBin"]] == 1,] - u_dta[[t_index]] <- dta[dta[["TrtBin"]] == 0,] - treatment_count <- cur_grp %in% t_dta[[t_index]][["ConstraintGroupSet_Opt"]] - untreated_count <- cur_grp %in% u_dta[[t_index]][["ConstraintGroupSet_Opt"]] + t_dta[[t_index]] <- dta@data[dta@data[,"TrtBin"] == 1,] + u_dta[[t_index]] <- dta@data[dta@data[,"TrtBin"] == 0,] + treatment_count <- cur_grp %in% t_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] + untreated_count <- cur_grp %in% u_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] if ((untreated_count == FALSE) || (treatment_count == FALSE)) { - dta <- dta[!dta[["ConstraintGroupSet_Opt"]] == cur_grp,] + dta <- dta@data[!dta@data[,"ConstraintGroupSet_Opt"] == cur_grp,] t_dta[[t_index]] <- NULL u_dta[[t_index]] <- NULL grp_list[[t_index]] <- NULL war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") warning(war_statement) } else { - t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]][["ConstraintGroupSet_Opt"]] == cur_grp,] - u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]][["ConstraintGroupSet_Opt"]] == cur_grp,] + t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] == cur_grp,] + u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] == cur_grp,] cnt = cnt + 1 } @@ -99,7 +99,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat5") if (drop_unmatched == TRUE) { - dta <- dta[dta@data[["PSM_match_ID"]] != -999,] + dta <- dta[dta@data[,"PSM_match_ID"] != -999,] } anc_v_int <- strsplit(psm_eq, "~")[[1]][2] @@ -111,11 +111,11 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #Drop observations according to the selected method if (drop_method == "SD") { #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. - psm_sd_thresh = sd(dta[["PSM_trtProb"]]) * drop_thresh + psm_sd_thresh = sd(dta@data[,"PSM_trtProb"]) * drop_thresh if (visual == "TRUE") { print(psm_sd_thresh) } - dta <- dta[dta@data[["PSM_distance"]] < psm_sd_thresh,] + dta <- dta[dta@data[,"PSM_distance"] < psm_sd_thresh,] } #Plot the pre and post-dropping balance for PSM model... @@ -139,9 +139,9 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat7.1") if (c_type == "matrix") { - dta@data[[ed_v]] <- as.numeric(dta@data[[ed_v]]) + dta@data[,ed_v] <- as.numeric(dta@data[,ed_v]) - init_dta@data[[ed_v]] <- as.numeric(init_dta@data[[ed_v]]) + init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v]) c_type = "numeric" } diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 992a39a..2fbccd5 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -35,9 +35,9 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { min_cut <- max(min(treated[,"PSM_trtProb"], na.rm = TRUE), min(untreated[,"PSM_trtProb"], na.rm = TRUE)) max_cut <- min(max(treated[,"PSM_trtProb"], na.rm = TRUE), max(untreated[,"PSM_trtProb"], na.rm = TRUE)) - retData <- retData[!is.na(retData@data[,"PSM_trtProb"]),] - retData <- retData[retData@data[,"PSM_trtProb"] >= min_cut,] - retData <- retData[retData@data[,"PSM_trtProb"] <= max_cut,] + retData <- retData@data[!is.na(retData@data[,"PSM_trtProb"]),] + retData <- retData@data[retData@data[,"PSM_trtProb"] >= min_cut,] + retData <- retData@data[retData@data[,"PSM_trtProb"] <= max_cut,] #} From 691db70f6328fed07defa21453a4bb4d753da0e8 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:36:03 -0400 Subject: [PATCH 109/212] Test. --- R/SAT.R | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 4f3fc83..f0ae8ce 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -7,6 +7,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo drop_method = drop_opts["drop_method"] drop_thresh = as.numeric(drop_opts["drop_thresh"]) + + print("sat1") if (!is.null(constraints)) { @@ -15,7 +17,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo dta@data[,"ConstraintGroupSet_Opt"] <- dta@data[,constraints["groups"]] } else { - dta@data[,"ConstraintGroupSet_Opt"] <- 1 + dta$ConstraintGroupSet_Opt <- 1 } if (names(constraints)[cst] == "distance") { dist_PSM = as.numeric(constraints["distance"][[1]]) @@ -24,44 +26,54 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } } } else { - dta@data[,"ConstraintGroupSet_Opt"] <- 1 + dta$ConstraintGroupSet_Opt <- 1 #max the distance threshold by taking the diagonal of the bounding box. dist_PSM = NULL } + + print("sat2") #Caclulate the number of groups to constrain by, if any. - group_constraints <- unique(dta@data[,"ConstraintGroupSet_Opt"]) + group_constraints <- unique(dta$ConstraintGroupSet_Opt) #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) t_dta <- list() u_dta <-list() grp_list <- list() cnt = 0 + for (grp in 1:length(group_constraints)) { cur_grp <- as.matrix(group_constraints)[grp] grp_index = length(grp_list)+1 t_index = length(t_dta)+1 grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] - t_dta[[t_index]] <- dta@data[dta@data[,"TrtBin"] == 1,] - u_dta[[t_index]] <- dta@data[dta@data[,"TrtBin"] == 0,] - treatment_count <- cur_grp %in% t_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] - untreated_count <- cur_grp %in% u_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] + + t_dta[[t_index]] <- dta[dta$TrtBin == 1,] + u_dta[[t_index]] <- dta[dta$TrtBin == 0,] + + treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt + untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt + if ((untreated_count == FALSE) || (treatment_count == FALSE)) { - dta <- dta@data[!dta@data[,"ConstraintGroupSet_Opt"] == cur_grp,] + dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] t_dta[[t_index]] <- NULL u_dta[[t_index]] <- NULL grp_list[[t_index]] <- NULL war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") warning(war_statement) + } else { - t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] == cur_grp,] - u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]@data[,"ConstraintGroupSet_Opt"] == cur_grp,] + t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] cnt = cnt + 1 } } + + + print("sat3") temp_dta <- list() From 227718ea872bee747fac6d2a4cafe7bd3926149e Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:50:07 -0400 Subject: [PATCH 110/212] Test. --- R/BuildTimeSeries.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 7bc86fa..cb082ea 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -19,7 +19,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(varN) dta@data[,varN] <- 0 - dta@data[,varN][!is.na(dta@data[[colYears[j]]]) || dta@data[[colYears[j]]] <= as.Date(paste(years[k],"01","01", sep="-"))] <- 1 + dta@data[,varN][!is.na(dta@data[[colYears[j]]]) && dta@data[[colYears[j]]] <= years[k]] <- 1 } @@ -35,7 +35,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY - print("bts2") timer <- proc.time() From a6b8b763eb7b196ade0e6c10579e216383eaae32 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 13:54:52 -0400 Subject: [PATCH 111/212] Test. --- R/SpatialCausalPSM.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 2fbccd5..992a39a 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -35,9 +35,9 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { min_cut <- max(min(treated[,"PSM_trtProb"], na.rm = TRUE), min(untreated[,"PSM_trtProb"], na.rm = TRUE)) max_cut <- min(max(treated[,"PSM_trtProb"], na.rm = TRUE), max(untreated[,"PSM_trtProb"], na.rm = TRUE)) - retData <- retData@data[!is.na(retData@data[,"PSM_trtProb"]),] - retData <- retData@data[retData@data[,"PSM_trtProb"] >= min_cut,] - retData <- retData@data[retData@data[,"PSM_trtProb"] <= max_cut,] + retData <- retData[!is.na(retData@data[,"PSM_trtProb"]),] + retData <- retData[retData@data[,"PSM_trtProb"] >= min_cut,] + retData <- retData[retData@data[,"PSM_trtProb"] <= max_cut,] #} From 3d5c4a5ea12c6e382a9fde1644b8c87bc9136da8 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 16:12:39 -0400 Subject: [PATCH 112/212] Test. --- R/BuildTimeSeries.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index cb082ea..455bfe0 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -19,7 +19,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(varN) dta@data[,varN] <- 0 - dta@data[,varN][!is.na(dta@data[[colYears[j]]]) && dta@data[[colYears[j]]] <= years[k]] <- 1 + # dta@data[,varN][!is.na(dta@data[[colYears[j]]]) && dta@data[[colYears[j]]] <= years[k]] <- 1 + dta@data[,varN][dta@data[[colYears[j]]] <= years[k]] <- 1 } From de90481afdd81147603347cfb24d3d5f9c43397c Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 16:24:59 -0400 Subject: [PATCH 113/212] Test. --- R/BuildTimeSeries.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 455bfe0..a526637 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -18,9 +18,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY varN = paste("TrtMnt_",colYears[j],"_",years[k], sep="") print(varN) - dta@data[,varN] <- 0 - # dta@data[,varN][!is.na(dta@data[[colYears[j]]]) && dta@data[[colYears[j]]] <= years[k]] <- 1 - dta@data[,varN][dta@data[[colYears[j]]] <= years[k]] <- 1 + dta@data[varN] <- 0 + dta@data[varN][!is.na(dta@data[colYears[j]]) && dta@data[colYears[j]] <= years[k]] <- 1 + # dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 } From 1ad0accffda348de38470ed9b4757c9b9241de23 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 16 Sep 2015 16:30:58 -0400 Subject: [PATCH 114/212] Test. --- R/BuildTimeSeries.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index a526637..5e363d7 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -19,8 +19,9 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(varN) dta@data[varN] <- 0 - dta@data[varN][!is.na(dta@data[colYears[j]]) && dta@data[colYears[j]] <= years[k]] <- 1 - # dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 + # dta@data[varN][!is.na(dta@data[colYears[j]]) && dta@data[colYears[j]] <= years[k]] <- 1 + dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 + dta@data[varN][is.na(dta@data[colYears[j]])] <- 0 } From 21f881e9b9e6710981e06cac09d74f8073d4d1bf Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 17 Sep 2015 13:21:59 -0400 Subject: [PATCH 115/212] Test. --- R/BuildTimeSeries.R | 1 - R/Stage2PSM.R | 13 +++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 5e363d7..0f510fc 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -19,7 +19,6 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY print(varN) dta@data[varN] <- 0 - # dta@data[varN][!is.na(dta@data[colYears[j]]) && dta@data[colYears[j]] <= years[k]] <- 1 dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 dta@data[varN][is.na(dta@data[colYears[j]])] <- 0 diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 4305cfe..1418eb0 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -42,6 +42,17 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { if (type == "cmreg") { + # make sure cluster options provided are valid + if (length(opts) == 0 || opts == NULL) { + print("Must have at least 1 clustering option") + return("Invalid opts given.") + + } else if (length(opts) > 2) { + print("Cannot have more than 2 clustering options") + return("Invalid opts given.") + } + + print("c1a") m_fit <- lm(model, dta) @@ -86,8 +97,6 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { } else if (length(opts) == 2) { m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) - } else { - return("CANNOT HAVE MORE THAN 2 OPTS") } CMREG <- coeftest(m_fit,m_fit$var) From 7d1e73811a6803f15199c8b359a5cba878d7c3eb Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 17 Sep 2015 13:24:36 -0400 Subject: [PATCH 116/212] Test. --- R/Stage2PSM.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 1418eb0..58b4712 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -43,7 +43,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { if (type == "cmreg") { # make sure cluster options provided are valid - if (length(opts) == 0 || opts == NULL) { + if (opts == NULL || length(opts) == 0) { print("Must have at least 1 clustering option") return("Invalid opts given.") @@ -68,6 +68,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { print("c2") if (!is.null(table_out)) { + dta_tmp <- dta if( class(dta) == "data.frame") { From 292d0b14af55a36e6d82e3a40ce00673bc6ce8da Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 17 Sep 2015 13:26:10 -0400 Subject: [PATCH 117/212] Test. --- R/Stage2PSM.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 58b4712..5296f69 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -43,6 +43,9 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { if (type == "cmreg") { # make sure cluster options provided are valid + print(opts == NULL) + print(length(opts) == 0) + if (opts == NULL || length(opts) == 0) { print("Must have at least 1 clustering option") return("Invalid opts given.") @@ -68,7 +71,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { print("c2") if (!is.null(table_out)) { - + dta_tmp <- dta if( class(dta) == "data.frame") { From 3831022bcc1aca59ad517c5b68d4c88024d4fc32 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 17 Sep 2015 13:30:30 -0400 Subject: [PATCH 118/212] Test. --- R/Stage2PSM.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index 5296f69..bb3a261 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -45,8 +45,8 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # make sure cluster options provided are valid print(opts == NULL) print(length(opts) == 0) - - if (opts == NULL || length(opts) == 0) { + + if (is.null(opts) || length(opts) == 0) { print("Must have at least 1 clustering option") return("Invalid opts given.") From cb1589288d99843b8a7c910fb5e33e36fb4c8d77 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 9 Oct 2015 14:17:34 -0400 Subject: [PATCH 119/212] Fix fitting same model multiple times in stage2psm. --- R/Stage2PSM.R | 10 +-- R/timeRangeAvg.R | 22 +++--- R/timeRangeTrend.R | 171 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 186 insertions(+), 17 deletions(-) diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index bb3a261..8936173 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -3,7 +3,7 @@ #These functions are to make common modeling strategies easier to specify for users #That do not write their own models. -Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { +Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL, force_posdef = TRUE) { ret_var <- list() @@ -14,7 +14,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { print("==========================") #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") print(summary(m_fit)) - ret_var$unstandardized <- lm(model, dta) + ret_var$unstandardized <- m_fit texreg::plotreg(m_fit, omit.coef="(match)|(Intercept)", custom.model.names="Unstandardized Model", custom.note=model) if (!is.null(table_out)) { @@ -29,7 +29,7 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { } dta_fit_std <- lm(model,dta_tmp) - ret_var$standardized <- lm(model,dta_tmp) + ret_var$standardized <- dta_fit_std print("==========================") print("STANDARDIZED MODEL RESULTS") print("==========================") @@ -96,10 +96,10 @@ Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL) { # m_fit[["var"]] <- eval(parse(text=exec)) if (length(opts) == 1) { - m_fit$var <- cluster.vcov(m_fit,dta[opts[1]]) + m_fit$var <- cluster.vcov(m_fit,dta[opts[1]], force_posdef=force_posdef) } else if (length(opts) == 2) { - m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]])) + m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]]), force_posdef=force_posdef) } diff --git a/R/timeRangeAvg.R b/R/timeRangeAvg.R index 1af3cfd..c378173 100644 --- a/R/timeRangeAvg.R +++ b/R/timeRangeAvg.R @@ -1,9 +1,13 @@ -timeRangeAvg <- function(dta,prefix,startyr,endyr) -{ - searchS = paste("^",prefix,startyr,sep="") - searchE = paste("^",prefix,endyr,sep="") - strt_id <- grep(searchS,colnames(dta)) - end_id <- grep(searchE,colnames(dta)) - rmean <- rowMeans(dta[strt_id:end_id]) - return(rmean) -} \ No newline at end of file +timeRangeAvg <- function(dta, prefix, startyr, endyr) { + + searchS = paste("^",prefix,startyr, sep="") + searchE = paste("^",prefix,endyr, sep="") + + start_id <- grep(searchS, colnames(dta)) + end_id <- grep(searchE, colnames(dta)) + + rmean <- rowMeans(dta[start_id:end_id]) + + return(rmean) + +} diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index ac41eda..6fbe27e 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -1,6 +1,96 @@ + + + +timeRangeType <- function (columns, prefix, startyr, endyr, field) { + + if (!is.na(as.numeric(startyr) && is.na(as.integer(endyr) && !is.na(field) && field %in% column_names) { + type = "pre" + startyr = as.integer(startyr) + + } else if (is.na(as.numeric(startyr) && !is.na(as.integer(endyr) && !is.na(field) && field %in% column_names) { + type = "post" + endyr = as.integer(endyr) + + } else if (!is.na(as.numeric(startyr) && !is.na(as.integer(endyr) && as.integer(endyr) > as.integer(startyr)) { + type = "range" + startyr = as.integer(startyr) + endyr = as.integer(endyr) + + } else { + type = "invalid" + } + + return(c(type, startyr, endyr)) + +} + + + # run linear model on data within year range as specified # by field prefix and return coefficients -timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { +timeRangeTrend <- function (dta, prefix, startyr, endyr, field=NA, IDfield, thresh=0.5) { + + check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) + + type = check[1] + startyr = check[2] + endyr = check[3] + + + if (type == "range") { + + output <- timeRangeTrend_calc(dta, prefix, startyr, tmp_endyr, IDfield, thresh=0.5) + + } else if (type == "pre") { + + output <- apply(dta, 1, function (row) { + + tmp_endyr <- as.integer(row['start_actual_isodate']) + + if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { + return(as.integer("NA")) + + } else { + return(timeRangeTrend_calc(row, prefix, startyr, tmp_endyr, IDfield, thresh=0.5, field="start_actual_isodate")) + + } + + }) + + + } else if (type == "post") { + + output <- lapply(dta, function (row) { + + tmp_startyr <- as.integer(row['start_actual_isodate']) + + if (is.na(tmp_startyr) || tmp_startyr >= endyr) { + return(as.integer("NA")) + + } else { + return(timeRangeTrend_calc(row, prefix, tmp_startyr, endyr, IDfield, thresh=0.5, field="start_actual_isodate")) + + } + + }) + + + } else if (type == "invalid") { + output <- 1 + + } else { + output <- 2 + + } + + return(output) + +} + + + +timeRangeTrend_calc <- function (dta, prefix, startyr, endyr, IDfield, thresh=0.5) { + # create new dataframe from all columns in dta dataframe that # are either the ID or a year which is indicated by the prefix @@ -18,7 +108,9 @@ timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { # generate new year field by removing prefix from variable (original column names) # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) - analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(substr(x, yIndex[1], yIndex[1]+3))) + analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) { + as.numeric(substr(x, yIndex[1], yIndex[1]+3)) + }) # keep years in range specified analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] @@ -36,7 +128,7 @@ timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { ID_dat <- analysisDF[analysisDF[IDfield] == ID,] dat_length <-length(ID_dat) - count_na <-sum(is.na(ID_dat[['value']])) + count_na <-sum(is.na(ID_dat[['value']])) count_non_na <- dat_length - count_na percent_na <- count_na / dat_length @@ -59,3 +151,76 @@ timeRangeTrend <- function(dta, prefix, startyr, endyr, IDfield, thresh=0.5) { return(dta[["newfieldID"]]) } + + + +timeRangeAvg <- function (dta, prefix, startyr, endyr, field=NA) { + + check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) + + type = check[1] + startyr = check[2] + endyr = check[3] + + + if (type == "range") { + + output <- timeRangeAvg_calc(dta, prefix, startyr, endyr) + + } else if (type == "pre") { + + output <- apply(dta, 1, function (row) { + + tmp_endyr <- as.integer(row['start_actual_isodate']) + + if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { + return(as.integer("NA")) + + } else { + return(timeRangeAvg_calc(row, prefix, startyr, tmp_endyr)) + + } + + }) + + + } else if (type == "post") { + + output <- lapply(dta, function (row) { + + tmp_startyr <- as.integer(row['start_actual_isodate']) + + if (is.na(tmp_startyr) || tmp_startyr >= endyr) { + return(as.integer("NA")) + + } else { + return(timeRangeAvg_calc(row, prefix, tmp_startyr, endyr)) + + } + + }) + + } else if (type == "invalid") { + output <- 1 + + } else { + output <- 2 + + } + + return(output) + +} + + + +timeRangeAvg_calc <- function (dta, prefix, startyr, endyr) { + + range <- c(startyr:endyr) + search <- paste("^",prefix,"(",paste(range, collapse="|"),")", sep="") + matches <- grepl(search, colnames(dta)) + rmean <- rowMeans(dta[matches], na.rm=FALSE) + + return(rmean) + +} From 2c5d8a5f3f266ece557df01573a21b2e63ccedde Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 9 Oct 2015 14:19:11 -0400 Subject: [PATCH 120/212] Temp. remove timerangetrend. --- R/timeRangeTrend.R | 266 ++++++++++++++++++++++----------------------- 1 file changed, 133 insertions(+), 133 deletions(-) diff --git a/R/timeRangeTrend.R b/R/timeRangeTrend.R index 6fbe27e..cb196ce 100644 --- a/R/timeRangeTrend.R +++ b/R/timeRangeTrend.R @@ -1,226 +1,226 @@ -timeRangeType <- function (columns, prefix, startyr, endyr, field) { +# timeRangeType <- function (columns, prefix, startyr, endyr, field) { - if (!is.na(as.numeric(startyr) && is.na(as.integer(endyr) && !is.na(field) && field %in% column_names) { - type = "pre" - startyr = as.integer(startyr) +# if (!is.na(as.numeric(startyr) && is.na(as.integer(endyr) && !is.na(field) && field %in% column_names) { +# type = "pre" +# startyr = as.integer(startyr) - } else if (is.na(as.numeric(startyr) && !is.na(as.integer(endyr) && !is.na(field) && field %in% column_names) { - type = "post" - endyr = as.integer(endyr) +# } else if (is.na(as.numeric(startyr) && !is.na(as.integer(endyr) && !is.na(field) && field %in% column_names) { +# type = "post" +# endyr = as.integer(endyr) - } else if (!is.na(as.numeric(startyr) && !is.na(as.integer(endyr) && as.integer(endyr) > as.integer(startyr)) { - type = "range" - startyr = as.integer(startyr) - endyr = as.integer(endyr) +# } else if (!is.na(as.numeric(startyr) && !is.na(as.integer(endyr) && as.integer(endyr) > as.integer(startyr)) { +# type = "range" +# startyr = as.integer(startyr) +# endyr = as.integer(endyr) - } else { - type = "invalid" - } +# } else { +# type = "invalid" +# } - return(c(type, startyr, endyr)) +# return(c(type, startyr, endyr)) -} +# } -# run linear model on data within year range as specified -# by field prefix and return coefficients -timeRangeTrend <- function (dta, prefix, startyr, endyr, field=NA, IDfield, thresh=0.5) { +# # run linear model on data within year range as specified +# # by field prefix and return coefficients +# timeRangeTrend <- function (dta, prefix, startyr, endyr, field=NA, IDfield, thresh=0.5) { - check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) +# check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) - type = check[1] - startyr = check[2] - endyr = check[3] +# type = check[1] +# startyr = check[2] +# endyr = check[3] - if (type == "range") { +# if (type == "range") { - output <- timeRangeTrend_calc(dta, prefix, startyr, tmp_endyr, IDfield, thresh=0.5) +# output <- timeRangeTrend_calc(dta, prefix, startyr, tmp_endyr, IDfield, thresh=0.5) - } else if (type == "pre") { +# } else if (type == "pre") { - output <- apply(dta, 1, function (row) { +# output <- apply(dta, 1, function (row) { - tmp_endyr <- as.integer(row['start_actual_isodate']) +# tmp_endyr <- as.integer(row['start_actual_isodate']) - if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { - return(as.integer("NA")) +# if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { +# return(as.integer("NA")) - } else { - return(timeRangeTrend_calc(row, prefix, startyr, tmp_endyr, IDfield, thresh=0.5, field="start_actual_isodate")) +# } else { +# return(timeRangeTrend_calc(row, prefix, startyr, tmp_endyr, IDfield, thresh=0.5, field="start_actual_isodate")) - } +# } - }) +# }) - } else if (type == "post") { +# } else if (type == "post") { - output <- lapply(dta, function (row) { +# output <- lapply(dta, function (row) { - tmp_startyr <- as.integer(row['start_actual_isodate']) +# tmp_startyr <- as.integer(row['start_actual_isodate']) - if (is.na(tmp_startyr) || tmp_startyr >= endyr) { - return(as.integer("NA")) +# if (is.na(tmp_startyr) || tmp_startyr >= endyr) { +# return(as.integer("NA")) - } else { - return(timeRangeTrend_calc(row, prefix, tmp_startyr, endyr, IDfield, thresh=0.5, field="start_actual_isodate")) +# } else { +# return(timeRangeTrend_calc(row, prefix, tmp_startyr, endyr, IDfield, thresh=0.5, field="start_actual_isodate")) - } +# } - }) +# }) - } else if (type == "invalid") { - output <- 1 +# } else if (type == "invalid") { +# output <- 1 - } else { - output <- 2 +# } else { +# output <- 2 - } +# } - return(output) +# return(output) -} +# } -timeRangeTrend_calc <- function (dta, prefix, startyr, endyr, IDfield, thresh=0.5) { +# timeRangeTrend_calc <- function (dta, prefix, startyr, endyr, IDfield, thresh=0.5) { - # create new dataframe from all columns in dta dataframe that - # are either the ID or a year which is indicated by the prefix - grep_str = paste(IDfield, prefix, sep="|") - tDF <- dta@data[grepl(grep_str, names(dta@data))] +# # create new dataframe from all columns in dta dataframe that +# # are either the ID or a year which is indicated by the prefix +# grep_str = paste(IDfield, prefix, sep="|") +# tDF <- dta@data[grepl(grep_str, names(dta@data))] - # melt all years columns in new dataframe - analysisDF <- melt(tDF, id=c(IDfield)) +# # melt all years columns in new dataframe +# analysisDF <- melt(tDF, id=c(IDfield)) - # cleaned GREP - remove year digit placeholders - # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) +# # cleaned GREP - remove year digit placeholders +# # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) - # get location of year in prefix - yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) +# # get location of year in prefix +# yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) - # generate new year field by removing prefix from variable (original column names) - # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) - analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) { - as.numeric(substr(x, yIndex[1], yIndex[1]+3)) - }) +# # generate new year field by removing prefix from variable (original column names) +# # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) +# analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) { +# as.numeric(substr(x, yIndex[1], yIndex[1]+3)) +# }) - # keep years in range specified - analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] - analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] +# # keep years in range specified +# analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] +# analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] - # create empty field - dta@data["newfieldID"] <- 0 +# # create empty field +# dta@data["newfieldID"] <- 0 - # iterate over original dataframe - for (i in 1:length(dta)) { - # get id for row (in original data) - ID <- as.character(dta@data[IDfield][i,]) +# # iterate over original dataframe +# for (i in 1:length(dta)) { +# # get id for row (in original data) +# ID <- as.character(dta@data[IDfield][i,]) - # get all data corresponding to id from analysis dataframe - ID_dat <- analysisDF[analysisDF[IDfield] == ID,] +# # get all data corresponding to id from analysis dataframe +# ID_dat <- analysisDF[analysisDF[IDfield] == ID,] - dat_length <-length(ID_dat) - count_na <-sum(is.na(ID_dat[['value']])) - count_non_na <- dat_length - count_na - percent_na <- count_na / dat_length +# dat_length <-length(ID_dat) +# count_na <-sum(is.na(ID_dat[['value']])) +# count_non_na <- dat_length - count_na +# percent_na <- count_na / dat_length - # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA - if (percent_na > thresh || count_non_na < 2) { +# # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA +# if (percent_na > thresh || count_non_na < 2) { - dta@data["newfieldID"][i,] <- NA +# dta@data["newfieldID"][i,] <- NA - } else { - # fit trend model - trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) +# } else { +# # fit trend model +# trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) - # add trend coefficients to new field - dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] - } +# # add trend coefficients to new field +# dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] +# } - } +# } - # return new field with trend coefficients - return(dta[["newfieldID"]]) +# # return new field with trend coefficients +# return(dta[["newfieldID"]]) -} +# } -timeRangeAvg <- function (dta, prefix, startyr, endyr, field=NA) { +# timeRangeAvg <- function (dta, prefix, startyr, endyr, field=NA) { - check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) +# check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) - type = check[1] - startyr = check[2] - endyr = check[3] +# type = check[1] +# startyr = check[2] +# endyr = check[3] - if (type == "range") { +# if (type == "range") { - output <- timeRangeAvg_calc(dta, prefix, startyr, endyr) +# output <- timeRangeAvg_calc(dta, prefix, startyr, endyr) - } else if (type == "pre") { +# } else if (type == "pre") { - output <- apply(dta, 1, function (row) { +# output <- apply(dta, 1, function (row) { - tmp_endyr <- as.integer(row['start_actual_isodate']) +# tmp_endyr <- as.integer(row['start_actual_isodate']) - if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { - return(as.integer("NA")) +# if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { +# return(as.integer("NA")) - } else { - return(timeRangeAvg_calc(row, prefix, startyr, tmp_endyr)) +# } else { +# return(timeRangeAvg_calc(row, prefix, startyr, tmp_endyr)) - } +# } - }) +# }) - } else if (type == "post") { +# } else if (type == "post") { - output <- lapply(dta, function (row) { +# output <- lapply(dta, function (row) { - tmp_startyr <- as.integer(row['start_actual_isodate']) +# tmp_startyr <- as.integer(row['start_actual_isodate']) - if (is.na(tmp_startyr) || tmp_startyr >= endyr) { - return(as.integer("NA")) +# if (is.na(tmp_startyr) || tmp_startyr >= endyr) { +# return(as.integer("NA")) - } else { - return(timeRangeAvg_calc(row, prefix, tmp_startyr, endyr)) +# } else { +# return(timeRangeAvg_calc(row, prefix, tmp_startyr, endyr)) - } +# } - }) +# }) - } else if (type == "invalid") { - output <- 1 +# } else if (type == "invalid") { +# output <- 1 - } else { - output <- 2 +# } else { +# output <- 2 - } +# } - return(output) +# return(output) -} +# } -timeRangeAvg_calc <- function (dta, prefix, startyr, endyr) { +# timeRangeAvg_calc <- function (dta, prefix, startyr, endyr) { - range <- c(startyr:endyr) - search <- paste("^",prefix,"(",paste(range, collapse="|"),")", sep="") - matches <- grepl(search, colnames(dta)) - rmean <- rowMeans(dta[matches], na.rm=FALSE) +# range <- c(startyr:endyr) +# search <- paste("^",prefix,"(",paste(range, collapse="|"),")", sep="") +# matches <- grepl(search, colnames(dta)) +# rmean <- rowMeans(dta[matches], na.rm=FALSE) - return(rmean) +# return(rmean) -} +# } From 3a1313d6c19a8cf11859591e63f326b14f65342d Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 14 Oct 2015 10:00:52 -0400 Subject: [PATCH 121/212] Fix empty check on SAT constraints. --- R/SAT.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index f0ae8ce..3c85cd5 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -11,7 +11,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat1") - if (!is.null(constraints)) { + if (!is.null(constraints) && contraints != c()) { for (cst in 1:length(names(constraints))) { if (names(constraints)[cst] == "groups") { dta@data[,"ConstraintGroupSet_Opt"] <- dta@data[,constraints["groups"]] From d6b02a5576622fe44b9986c8bb7a398d47b38428 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 14 Oct 2015 11:21:07 -0400 Subject: [PATCH 122/212] Add spatialcausalpsm drop option back in. Remove some debug prints. --- R/SpatialCausalPSM.R | 4 ++-- R/fastNN_binary_func.R | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 992a39a..44b6a41 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -27,7 +27,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { # Second, if a drop parameter - if set to "support", remove observations # that don't overlap in the PSM distribution. - #if (drop == "support") { + if (drop == "support") { # Drop treated <- retData@data[retData@data[,"TrtBin"] == 1,] @@ -39,7 +39,7 @@ SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { retData <- retData[retData@data[,"PSM_trtProb"] >= min_cut,] retData <- retData[retData@data[,"PSM_trtProb"] <= max_cut,] - #} + } if (visual == "TRUE") { # Post drop histograms diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 67e9b3c..908c57d 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -7,7 +7,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - print("nn1.0") + # print("nn1.0") #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. @@ -24,7 +24,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - print("nn2.0") + # print("nn2.0") #Calculate a distance decay function #to perturb pairs based on their distances. @@ -68,7 +68,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - print("nn3.0") + # print("nn3.0") #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. @@ -96,7 +96,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") - print("nn4.0") + # print("nn4.0") #Add the Treatment ID to the Control Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID From 7867fc35c365700dae26db56130948ed3f7621c1 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 14 Oct 2015 11:37:41 -0400 Subject: [PATCH 123/212] Test. --- R/SAT.R | 2 +- R/fastNN_binary_func.R | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 3c85cd5..46c31fb 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -67,7 +67,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } else { t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - + cnt = cnt + 1 } } diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 908c57d..67e9b3c 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -7,7 +7,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - # print("nn1.0") + print("nn1.0") #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. @@ -24,7 +24,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - # print("nn2.0") + print("nn2.0") #Calculate a distance decay function #to perturb pairs based on their distances. @@ -68,7 +68,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - # print("nn3.0") + print("nn3.0") #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. @@ -96,7 +96,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") - # print("nn4.0") + print("nn4.0") #Add the Treatment ID to the Control Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID From 6b6a215141f56505acade0c61a7af8ae6acd1af8 Mon Sep 17 00:00:00 2001 From: userz Date: Wed, 14 Oct 2015 11:39:09 -0400 Subject: [PATCH 124/212] Test. --- R/fastNN_binary_func.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 67e9b3c..3c58659 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -32,10 +32,14 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] untreated <- sorted_dta[sorted_dta[[trtMntVar]] ==0,] + + print("nn2.1") #Run the KNN for all neighbors. k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + print("nn2.2") + #Perturb the values based on the distance decay function, if selected. if (!is.null(dist_PSM)) { for (mC in 1:length(k[[1]])) { @@ -57,9 +61,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { x_dist = abs(tCoord[1] - tCoord[2]) euc_dist = sqrt(y_dist^2 + x_dist^2) + print("nn2.3") + PSM_score = k[["nn.dist"]][mC] geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + print("nn2.4") k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) From 85a4e30c6509430519574847e5bd7957dd8316e0 Mon Sep 17 00:00:00 2001 From: DanRunfola Date: Fri, 16 Oct 2015 14:05:58 -0400 Subject: [PATCH 125/212] Testing edits to the BuildTimeSeries module to fix panel null error --- R/BuildTimeSeries.R | 2 +- SCI_PanelFix.Rproj | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 SCI_PanelFix.Rproj diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 0f510fc..900a823 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -30,7 +30,7 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } - print(interpYears) + View(dta) diff --git a/SCI_PanelFix.Rproj b/SCI_PanelFix.Rproj new file mode 100644 index 0000000..a4dce49 --- /dev/null +++ b/SCI_PanelFix.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source From 34749957111b45e85d7326c9b84d3ab6e8f5c3e5 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:06:12 -0400 Subject: [PATCH 126/212] Update NN function use. --- R/SAT.R | 13 ++++++++----- R/fastNN_binary_func.R | 28 +++++++++++++++++++--------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 46c31fb..e4ecd01 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -96,16 +96,17 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } } + print("sat4") #Build the final datasets from subsets if (cnt > 1) { - dta <- temp_dta[[1]] - for(k in 2:cnt) { - dta <- maptools::spRbind(dta, temp_dta[[k]]) - } + dta <- temp_dta[[1]] + for(k in 2:cnt) { + dta <- maptools::spRbind(dta, temp_dta[[k]]) + } } else { - dta <- temp_dta[[1]] + dta <- temp_dta[[1]] } print("sat5") @@ -130,6 +131,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo dta <- dta[dta@data[,"PSM_distance"] < psm_sd_thresh,] } + + #Plot the pre and post-dropping balance for PSM model... #Balance metrics are based on "Misunderstandings between experimentalists and #observationalists about causal inference", Imal, King, and Stuart. diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 3c58659..6eb4639 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -24,14 +24,18 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - print("nn2.0") + dta@data[["nn_matched"]] <- 0 + + print("nn2") #Calculate a distance decay function #to perturb pairs based on their distances. for (j in 1:it_cnt) { - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] ==0,] + print("nn2.0") + + treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] print("nn2.1") @@ -43,6 +47,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Perturb the values based on the distance decay function, if selected. if (!is.null(dist_PSM)) { for (mC in 1:length(k[[1]])) { + + print("nn2.2.0") #Calculate the Euclidean Distance between pairs Control_ID = toString(untreated[[ids]][[mC]]) @@ -61,12 +67,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { x_dist = abs(tCoord[1] - tCoord[2]) euc_dist = sqrt(y_dist^2 + x_dist^2) - print("nn2.3") + print("nn2.2.1") PSM_score = k[["nn.dist"]][mC] geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - print("nn2.4") + print("nn2.2.2") k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) @@ -75,7 +81,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - print("nn3.0") + print("nn2.3") #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. @@ -103,7 +109,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") - print("nn4.0") + print("nn2.4") #Add the Treatment ID to the Control Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID @@ -116,10 +122,14 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]][which(dta@data[[ids]] == Treatment_ID)] = k[["nn.dist"]][,1][best_m_control] dta@data[["PSM_match_ID"]][which(dta@data[[ids]] == Treatment_ID)] = pair_id + print("nn2.5") + #Drop the paired match out of the iteration matrix - sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + sorted_dta[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID),]['nn_matched'] <- 1 + } return(dta) From 2c54d98c479229b2dc221e7513e1213674303042 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:12:32 -0400 Subject: [PATCH 127/212] Test --- R/BuildTimeSeries.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R index 900a823..f0c26bc 100644 --- a/R/BuildTimeSeries.R +++ b/R/BuildTimeSeries.R @@ -30,8 +30,8 @@ BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colY } } - View(dta) - + # View(dta) + # print(interpYears) From 3807331cb263abbca7864ef9a2e7ea5fed395a79 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:16:15 -0400 Subject: [PATCH 128/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 6eb4639..70a5985 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -128,7 +128,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - sorted_dta[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID),]['nn_matched'] <- 1 + sorted_dta[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID),][['nn_matched']] <- 1 } From 4d39760bddd95730d43505cd287e136442bfc869 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:17:59 -0400 Subject: [PATCH 129/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 70a5985..1f34c03 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -128,7 +128,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - sorted_dta[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 } From c983349577f629c4ec9cbf8744bc7eaa4ca75990 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:20:00 -0400 Subject: [PATCH 130/212] Test --- R/fastNN_binary_func.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 1f34c03..b096900 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -14,6 +14,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]),] + sorted_dta[["nn_matched"]] <- 0 + #Conduct the matching treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] @@ -24,7 +26,6 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - dta@data[["nn_matched"]] <- 0 print("nn2") From 8f69f06cb8ebdd3373f5a84d987949ecffe8f39e Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:28:01 -0400 Subject: [PATCH 131/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index b096900..55ac80d 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -12,7 +12,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]),] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, "PSM_trtProb")] sorted_dta[["nn_matched"]] <- 0 From 64d39e4cf5964e5435ca6ce9eb23e329bd627557 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:29:52 -0400 Subject: [PATCH 132/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 55ac80d..badef5f 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -12,7 +12,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, "PSM_trtProb")] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] sorted_dta[["nn_matched"]] <- 0 From 90e136a09a796df600055e42c554b316256dcd42 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:32:21 -0400 Subject: [PATCH 133/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index badef5f..1fb5c0a 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -13,7 +13,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - + return(sorted_dta) sorted_dta[["nn_matched"]] <- 0 From 0f914bd1f64a2f1a16600783507ef18a64a38e41 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:36:54 -0400 Subject: [PATCH 134/212] Test --- R/fastNN_binary_func.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 1fb5c0a..6fa4ade 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -12,9 +12,10 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - return(sorted_dta) - sorted_dta[["nn_matched"]] <- 0 + dta@data[["nn_matched"]] <- 0 + + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb", "nn_matched"])] + #Conduct the matching From 417ec95f617e118bf4e3665d96328c70392cbfde Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:37:16 -0400 Subject: [PATCH 135/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 6fa4ade..52e0304 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -14,7 +14,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. dta@data[["nn_matched"]] <- 0 - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb", "nn_matched"])] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb", "nn_matched")] From 98b0f86cb651d2fab27ada0a403029b02bef7f71 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:43:56 -0400 Subject: [PATCH 136/212] Test --- R/fastNN_binary_func.R | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 52e0304..1205d5e 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -9,6 +9,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") + #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. @@ -27,24 +28,29 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - print("nn2") #Calculate a distance decay function #to perturb pairs based on their distances. for (j in 1:it_cnt) { + time_list <- c() print("nn2.0") + timer <- proc.time() treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + time_list[1] <- (proc.time() - timer)[3] print("nn2.1") + timer <- proc.time() #Run the KNN for all neighbors. k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + time_list[2] <- (proc.time() - timer)[3] print("nn2.2") + timer <- proc.time() #Perturb the values based on the distance decay function, if selected. if (!is.null(dist_PSM)) { @@ -83,7 +89,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } + time_list[3] <- (proc.time() - timer)[3] print("nn2.3") + timer <- proc.time() #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. @@ -111,7 +119,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") + time_list[4] <- (proc.time() - timer)[3] print("nn2.4") + timer <- proc.time() #Add the Treatment ID to the Control Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID @@ -124,7 +134,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]][which(dta@data[[ids]] == Treatment_ID)] = k[["nn.dist"]][,1][best_m_control] dta@data[["PSM_match_ID"]][which(dta@data[[ids]] == Treatment_ID)] = pair_id + time_list[5] <- (proc.time() - timer)[3] print("nn2.5") + timer <- proc.time() #Drop the paired match out of the iteration matrix # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] @@ -132,6 +144,10 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + time_list[6] <- (proc.time() - timer)[3] + + print(paste(time_list)) + } return(dta) From e3e82558dc871588fd0ce899356f8c8a93c69b72 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:44:55 -0400 Subject: [PATCH 137/212] Test --- R/fastNN_binary_func.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 1205d5e..3149919 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -35,21 +35,21 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { for (j in 1:it_cnt) { time_list <- c() - print("nn2.0") + # print("nn2.0") timer <- proc.time() treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] time_list[1] <- (proc.time() - timer)[3] - print("nn2.1") + # print("nn2.1") timer <- proc.time() #Run the KNN for all neighbors. k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) time_list[2] <- (proc.time() - timer)[3] - print("nn2.2") + # print("nn2.2") timer <- proc.time() #Perturb the values based on the distance decay function, if selected. @@ -90,7 +90,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } time_list[3] <- (proc.time() - timer)[3] - print("nn2.3") + # print("nn2.3") timer <- proc.time() #Add the matched treatment and control values to the recording data frame @@ -120,7 +120,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { pair_id = paste(curgrp,j,sep="") time_list[4] <- (proc.time() - timer)[3] - print("nn2.4") + # print("nn2.4") timer <- proc.time() #Add the Treatment ID to the Control Row @@ -135,7 +135,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_match_ID"]][which(dta@data[[ids]] == Treatment_ID)] = pair_id time_list[5] <- (proc.time() - timer)[3] - print("nn2.5") + # print("nn2.5") timer <- proc.time() #Drop the paired match out of the iteration matrix From cb72f9242280386b13a023c86851470cc6ce4e3e Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:50:16 -0400 Subject: [PATCH 138/212] Test --- R/fastNN_binary_func.R | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 3149919..e797384 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -41,14 +41,14 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - time_list[1] <- (proc.time() - timer)[3] + time_list[1] <- round((proc.time() - timer)[3],5) # print("nn2.1") timer <- proc.time() #Run the KNN for all neighbors. k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - time_list[2] <- (proc.time() - timer)[3] + time_list[2] <- round((proc.time() - timer)[3],5) # print("nn2.2") timer <- proc.time() @@ -89,7 +89,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - time_list[3] <- (proc.time() - timer)[3] + time_list[3] <- round((proc.time() - timer)[3],5) # print("nn2.3") timer <- proc.time() @@ -119,22 +119,20 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") - time_list[4] <- (proc.time() - timer)[3] + time_list[4] <- round((proc.time() - timer)[3],5) # print("nn2.4") timer <- proc.time() - #Add the Treatment ID to the Control Row + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID - dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID)] = k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID)] = pair_id - - - #Add the Control ID to the Treatment Row - dta@data[["match"]][which(dta@data[[ids]] == Treatment_ID)] = Control_ID - dta@data[["PSM_distance"]][which(dta@data[[ids]] == Treatment_ID)] = k[["nn.dist"]][,1][best_m_control] - dta@data[["PSM_match_ID"]][which(dta@data[[ids]] == Treatment_ID)] = pair_id + dta@data$match[which(dta@data[[ids]] == Treatment_ID)] = Control_ID + + dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + + - time_list[5] <- (proc.time() - timer)[3] + time_list[5] <- round((proc.time() - timer)[3],5) # print("nn2.5") timer <- proc.time() @@ -144,7 +142,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - time_list[6] <- (proc.time() - timer)[3] + time_list[6] <- round((proc.time() - timer)[3],5) print(paste(time_list)) From 868b35799e47efebd8fe5e49a56cc281705e3142 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:53:09 -0400 Subject: [PATCH 139/212] Test --- R/fastNN_binary_func.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index e797384..d19ac0e 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -8,6 +8,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") + timerx <- proc.time() #Fast nearest neighbors search - will not arrive at optimum, @@ -124,8 +125,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - dta@data$match[which(dta@data[[ids]] == Control_ID)] = Treatment_ID - dta@data$match[which(dta@data[[ids]] == Treatment_ID)] = Control_ID + dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id @@ -148,6 +149,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } + print(proc.time() - timerx)[3]) + return(dta) } From e78a2abf88f8a63a532284ae7aa6f61c36dbe13b Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:53:33 -0400 Subject: [PATCH 140/212] Test --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index d19ac0e..af3276c 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -149,7 +149,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - print(proc.time() - timerx)[3]) + print((proc.time() - timerx)[3]) return(dta) From 0fd30074cbc553f315b59bb05372a963f9d369e2 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 11:56:14 -0400 Subject: [PATCH 141/212] Test --- R/fastNN_binary_func.R | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index af3276c..dcda985 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -29,35 +29,36 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - print("nn2") + # print("nn2") #Calculate a distance decay function #to perturb pairs based on their distances. for (j in 1:it_cnt) { - time_list <- c() + # time_list <- c() # print("nn2.0") - timer <- proc.time() + # timer <- proc.time() treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - time_list[1] <- round((proc.time() - timer)[3],5) + # time_list[1] <- round((proc.time() - timer)[3],5) # print("nn2.1") - timer <- proc.time() + # timer <- proc.time() #Run the KNN for all neighbors. k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - time_list[2] <- round((proc.time() - timer)[3],5) + # time_list[2] <- round((proc.time() - timer)[3],5) # print("nn2.2") - timer <- proc.time() + # timer <- proc.time() #Perturb the values based on the distance decay function, if selected. if (!is.null(dist_PSM)) { for (mC in 1:length(k[[1]])) { - print("nn2.2.0") + # print("nn2.2.0") + #Calculate the Euclidean Distance between pairs Control_ID = toString(untreated[[ids]][[mC]]) @@ -76,12 +77,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { x_dist = abs(tCoord[1] - tCoord[2]) euc_dist = sqrt(y_dist^2 + x_dist^2) - print("nn2.2.1") + # print("nn2.2.1") PSM_score = k[["nn.dist"]][mC] geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - print("nn2.2.2") + # print("nn2.2.2") k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) @@ -90,9 +91,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - time_list[3] <- round((proc.time() - timer)[3],5) + # time_list[3] <- round((proc.time() - timer)[3],5) # print("nn2.3") - timer <- proc.time() + # timer <- proc.time() #Add the matched treatment and control values to the recording data frame #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. @@ -120,9 +121,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j,sep="") - time_list[4] <- round((proc.time() - timer)[3],5) + # time_list[4] <- round((proc.time() - timer)[3],5) # print("nn2.4") - timer <- proc.time() + # timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID @@ -133,9 +134,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - time_list[5] <- round((proc.time() - timer)[3],5) + # time_list[5] <- round((proc.time() - timer)[3],5) # print("nn2.5") - timer <- proc.time() + # timer <- proc.time() #Drop the paired match out of the iteration matrix # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] @@ -143,9 +144,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - time_list[6] <- round((proc.time() - timer)[3],5) - - print(paste(time_list)) + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) } From 607b759e0d6b285e08e4b069d3bc7b9e7ec1bf9e Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:00:33 -0400 Subject: [PATCH 142/212] Test --- R/fastNN_binary_func.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index dcda985..f17d814 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -42,6 +42,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + print(nrow(sorted_dta)) + # time_list[1] <- round((proc.time() - timer)[3],5) # print("nn2.1") # timer <- proc.time() @@ -58,7 +60,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { for (mC in 1:length(k[[1]])) { # print("nn2.2.0") - + #Calculate the Euclidean Distance between pairs Control_ID = toString(untreated[[ids]][[mC]]) From e1b007838048a9946fd9c277b9af3bf19e1c5f0b Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:01:21 -0400 Subject: [PATCH 143/212] Test --- R/fastNN_binary_func.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f17d814..beffa40 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -42,8 +42,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - print(nrow(sorted_dta)) - + print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # time_list[1] <- round((proc.time() - timer)[3],5) # print("nn2.1") # timer <- proc.time() From 07c8576ea5d0ae813a38ac965f653d2357e490a1 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:13:41 -0400 Subject: [PATCH 144/212] Test --- R/fastNN_binary_func.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index beffa40..9d414af 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -39,10 +39,10 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # print("nn2.0") # timer <- proc.time() - treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) # time_list[1] <- round((proc.time() - timer)[3],5) # print("nn2.1") @@ -121,7 +121,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j,sep="") + pair_id = paste(curgrp,j, sep="") # time_list[4] <- round((proc.time() - timer)[3],5) # print("nn2.4") @@ -144,8 +144,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + treated <- treated[which(treated[[ids]] != Treatment_ID),] + untreated <- untreated[which(untreated[[ids]] != Control_ID),] + # time_list[6] <- round((proc.time() - timer)[3],5) # print(paste(time_list)) From 344535d2d9c07b315fb707b20564c191b9d4d017 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:17:15 -0400 Subject: [PATCH 145/212] Test --- R/fastNN_binary_func.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 9d414af..86688d6 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -14,10 +14,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - dta@data[["nn_matched"]] <- 0 - - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb", "nn_matched")] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] #Conduct the matching @@ -110,14 +108,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # #Treatment PSM ID # Treatment_ID = toString(treated[[ids]][[best_m_treated]]) - #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) + # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) + #Control PSM ID and Treatment PSM ID + Control_ID = toString(untreated[,ids][best_m_control]) + Treatment_ID = toString(treated[,ids][best_m_treated]) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From f21621b6d5195d9bca970a129e897a39109ae774 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:20:01 -0400 Subject: [PATCH 146/212] Test --- R/fastNN_binary_func.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 86688d6..20f48c1 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -102,23 +102,18 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #This will give us the matched index in the "treated" dataset. best_m_treated = k[["nn.index"]][best_m_control] - # #Control PSM ID - # Control_ID = toString(untreated[[ids]][[best_m_control]]) - - # #Treatment PSM ID - # Treatment_ID = toString(treated[[ids]][[best_m_treated]]) - - # #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) + + #Control PSM ID + cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + Control_ID = toString(eval(parse(text=cid_txt))) - # #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) + #Treatment PSM ID + tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + Treatment_ID = toString(eval(parse(text=tid_txt))) #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[,ids][best_m_control]) - Treatment_ID = toString(treated[,ids][best_m_treated]) + # Control_ID = toString(untreated[,ids][best_m_control]) + # Treatment_ID = toString(treated[,ids][best_m_treated]) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From 7253a3be12fa18c5ffaaf0f75a6cbe0005f795ab Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:22:09 -0400 Subject: [PATCH 147/212] Test --- R/fastNN_binary_func.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 20f48c1..6a31fdb 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -103,17 +103,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { best_m_treated = k[["nn.index"]][best_m_control] - #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) + # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[,ids][best_m_control]) - # Treatment_ID = toString(treated[,ids][best_m_treated]) + Control_ID = untreated[,ids][best_m_control] + Treatment_ID = treated[,ids][best_m_treated] #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From c8d0833fa07d93dc8ac153ec19b4df2559303de8 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:29:35 -0400 Subject: [PATCH 148/212] Test --- R/fastNN_binary_func.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 6a31fdb..20f48c1 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -103,17 +103,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { best_m_treated = k[["nn.index"]][best_m_control] - # #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) + #Control PSM ID + cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + Control_ID = toString(eval(parse(text=cid_txt))) - # #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) + #Treatment PSM ID + tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + Treatment_ID = toString(eval(parse(text=tid_txt))) #Control PSM ID and Treatment PSM ID - Control_ID = untreated[,ids][best_m_control] - Treatment_ID = treated[,ids][best_m_treated] + # Control_ID = toString(untreated[,ids][best_m_control]) + # Treatment_ID = toString(treated[,ids][best_m_treated]) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From 8beeb51c6a2bdb8aa374f6149b7a66a56f3ddfc7 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:52:39 -0400 Subject: [PATCH 149/212] Test --- R/SAT.R | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index e4ecd01..2b87c3f 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -78,22 +78,24 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo temp_dta <- list() - for (i in 1:cnt) { - cur_grp <- grp_list[[i]] - - print("sat3.1") - it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - - print("sat3.2") - if (mtd == "fastNN") { - # *** - # this is the slow part of functions - temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) - } + if (cnt > 0) { + for (i in 1:cnt) { + cur_grp <- grp_list[[i]] + + print("sat3.1") + it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + + print("sat3.2") + if (mtd == "fastNN") { + # *** + # this is the slow part of functions + temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + } - if (mtd == "NN_WithReplacement") { - print("NN with replacement is currently not available, please choose fastNN") - # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + if (mtd == "NN_WithReplacement") { + print("NN with replacement is currently not available, please choose fastNN") + # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + } } } From 7c833b64ef98c692711ef4d7dfba41ca2184209b Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 12:55:00 -0400 Subject: [PATCH 150/212] Test --- R/SAT.R | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 2b87c3f..a079503 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -79,6 +79,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo temp_dta <- list() if (cnt > 0) { + print("sat3.0") + for (i in 1:cnt) { cur_grp <- grp_list[[i]] @@ -97,21 +99,22 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } } - } - print("sat4") + print("sat3.1") + + #Build the final datasets from subsets + if (cnt > 1) { + dta <- temp_dta[[1]] + for(k in 2:cnt) { + dta <- maptools::spRbind(dta, temp_dta[[k]]) + } + } else { + dta <- temp_dta[[1]] + } - #Build the final datasets from subsets - if (cnt > 1) { - dta <- temp_dta[[1]] - for(k in 2:cnt) { - dta <- maptools::spRbind(dta, temp_dta[[k]]) - } - } else { - dta <- temp_dta[[1]] } - print("sat5") + print("sat4") if (drop_unmatched == TRUE) { dta <- dta[dta@data[,"PSM_match_ID"] != -999,] @@ -121,7 +124,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) anc_vars <- c(anc_vars[[1]], "PSM_trtProb") - print("sat6") + print("sat5") #Drop observations according to the selected method if (drop_method == "SD") { @@ -141,11 +144,11 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #Simplest suggestion of comparing means and checking if .25 SD apart used. cnt = 0 - print("sat7") + print("sat6") for (i in 1:length(anc_vars)) { - print("sat7.0") + print("sat6.0") #gsub to remove any factors() ed_v = sub("factor\\(","",anc_vars[i]) @@ -153,7 +156,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo c_type = class(init_dta@data[[ed_v]]) - print("sat7.1") + print("sat6.1") if (c_type == "matrix") { dta@data[,ed_v] <- as.numeric(dta@data[,ed_v]) @@ -163,7 +166,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo c_type = "numeric" } - print("sat7.2") + print("sat6.2") if ((c_type == "numeric") & (visual == "TRUE")) { cnt = cnt + 1 pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) @@ -213,7 +216,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } } - print("sat8") + print("sat7") if (visual=="TRUE") { #Output graphics From 8a45f4fa80f3b5723c858686086817939ff4b025 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 13:02:21 -0400 Subject: [PATCH 151/212] Test --- R/SAT.R | 69 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index a079503..c096cd7 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -73,49 +73,48 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } + if (cnt == 0) { + return('drop') + } + print("sat3") temp_dta <- list() - if (cnt > 0) { - print("sat3.0") - - for (i in 1:cnt) { - cur_grp <- grp_list[[i]] - - print("sat3.1") - it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - - print("sat3.2") - if (mtd == "fastNN") { - # *** - # this is the slow part of functions - temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) - } - - if (mtd == "NN_WithReplacement") { - print("NN with replacement is currently not available, please choose fastNN") - # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) - } - } + for (i in 1:cnt) { + cur_grp <- grp_list[[i]] print("sat3.1") + it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - #Build the final datasets from subsets - if (cnt > 1) { - dta <- temp_dta[[1]] - for(k in 2:cnt) { - dta <- maptools::spRbind(dta, temp_dta[[k]]) - } - } else { - dta <- temp_dta[[1]] + print("sat3.2") + if (mtd == "fastNN") { + # *** + # this is the slow part of functions + temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) } + if (mtd == "NN_WithReplacement") { + print("NN with replacement is currently not available, please choose fastNN") + # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + } } print("sat4") + #Build the final datasets from subsets + if (cnt > 1) { + dta <- temp_dta[[1]] + for(k in 2:cnt) { + dta <- maptools::spRbind(dta, temp_dta[[k]]) + } + } else { + dta <- temp_dta[[1]] + } + + print("sat5") + if (drop_unmatched == TRUE) { dta <- dta[dta@data[,"PSM_match_ID"] != -999,] } @@ -124,7 +123,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) anc_vars <- c(anc_vars[[1]], "PSM_trtProb") - print("sat5") + print("sat6") #Drop observations according to the selected method if (drop_method == "SD") { @@ -144,11 +143,11 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #Simplest suggestion of comparing means and checking if .25 SD apart used. cnt = 0 - print("sat6") + print("sat7") for (i in 1:length(anc_vars)) { - print("sat6.0") + print("sat7.0") #gsub to remove any factors() ed_v = sub("factor\\(","",anc_vars[i]) @@ -156,7 +155,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo c_type = class(init_dta@data[[ed_v]]) - print("sat6.1") + print("sat7.1") if (c_type == "matrix") { dta@data[,ed_v] <- as.numeric(dta@data[,ed_v]) @@ -166,7 +165,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo c_type = "numeric" } - print("sat6.2") + print("sat7.2") if ((c_type == "numeric") & (visual == "TRUE")) { cnt = cnt + 1 pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) @@ -216,7 +215,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } } - print("sat7") + print("sat8") if (visual=="TRUE") { #Output graphics From 919d1c22261955f7ab087307d6d331835a435c1d Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 19 Oct 2015 13:06:08 -0400 Subject: [PATCH 152/212] Test --- R/SAT.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index c096cd7..3d389dc 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -19,14 +19,16 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } else { dta$ConstraintGroupSet_Opt <- 1 } + if (names(constraints)[cst] == "distance") { dist_PSM = as.numeric(constraints["distance"][[1]]) } else { dist_PSM=NULL } } + } else { - dta$ConstraintGroupSet_Opt <- 1 + dta@data[,"ConstraintGroupSet_Opt"] <- 1 #max the distance threshold by taking the diagonal of the bounding box. dist_PSM = NULL } From c46cdbcc6261a21f5e3e77d1d39073a5a39e05f9 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 3 Nov 2015 13:53:02 -0500 Subject: [PATCH 153/212] Test nn issue. --- R/SAT.R | 2 +- R/fastNN_binary_func.R | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 3d389dc..7396621 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -7,7 +7,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo drop_method = drop_opts["drop_method"] drop_thresh = as.numeric(drop_opts["drop_thresh"]) - + print("sat1") diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 20f48c1..67a1b2b 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -21,7 +21,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Conduct the matching treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] - + print(length(treated[[1]])) + print(length(untreated[[1]])) it_cnt = min(length(treated[[1]]), length(untreated[[1]])) dta@data[["match"]] <- -999 dta@data[["PSM_distance"]] <- -999 From 6b8d97fdedd74eea9eb322bc99d57fea1dfdb70d Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 3 Nov 2015 13:56:07 -0500 Subject: [PATCH 154/212] Test nn issue. --- R/fastNN_binary_func.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 67a1b2b..9a9090d 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -21,8 +21,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Conduct the matching treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] - print(length(treated[[1]])) - print(length(untreated[[1]])) + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) dta@data[["match"]] <- -999 dta@data[["PSM_distance"]] <- -999 @@ -48,6 +47,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # timer <- proc.time() #Run the KNN for all neighbors. + print(length(treated[[1]])) + summary(treated[["PSM_trtProb"]]) + print(length(untreated[[1]])) + summary(untreated[["PSM_trtProb"]]) + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) # time_list[2] <- round((proc.time() - timer)[3],5) From 6fcc5d61f1d87762f2ac482d2db248d5c80e1a94 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 3 Nov 2015 14:08:36 -0500 Subject: [PATCH 155/212] Test nn issue. --- R/fastNN_binary_func.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 9a9090d..c771e83 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -47,10 +47,10 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # timer <- proc.time() #Run the KNN for all neighbors. - print(length(treated[[1]])) - summary(treated[["PSM_trtProb"]]) - print(length(untreated[[1]])) - summary(untreated[["PSM_trtProb"]]) + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) From 5c33df70aca5077e0d22eaa2b431fb2b62e3dd34 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 6 Nov 2015 11:57:26 -0500 Subject: [PATCH 156/212] test nn --- R/fastNN_binary_func.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index c771e83..40ff9e9 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -27,14 +27,14 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 - # print("nn2") + print("nn2") #Calculate a distance decay function #to perturb pairs based on their distances. for (j in 1:it_cnt) { # time_list <- c() - # print("nn2.0") + print("nn2.0") # timer <- proc.time() # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] @@ -55,14 +55,14 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) # time_list[2] <- round((proc.time() - timer)[3],5) - # print("nn2.2") + print("nn2.2") # timer <- proc.time() #Perturb the values based on the distance decay function, if selected. if (!is.null(dist_PSM)) { for (mC in 1:length(k[[1]])) { - # print("nn2.2.0") + print("nn2.2.0") #Calculate the Euclidean Distance between pairs Control_ID = toString(untreated[[ids]][[mC]]) @@ -82,12 +82,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { x_dist = abs(tCoord[1] - tCoord[2]) euc_dist = sqrt(y_dist^2 + x_dist^2) - # print("nn2.2.1") + print("nn2.2.1") PSM_score = k[["nn.dist"]][mC] geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # print("nn2.2.2") + print("nn2.2.2") k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) @@ -97,7 +97,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } # time_list[3] <- round((proc.time() - timer)[3],5) - # print("nn2.3") + print("nn2.3") # timer <- proc.time() #Add the matched treatment and control values to the recording data frame @@ -125,7 +125,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { pair_id = paste(curgrp,j, sep="") # time_list[4] <- round((proc.time() - timer)[3],5) - # print("nn2.4") + print("nn2.4") # timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row @@ -138,7 +138,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # time_list[5] <- round((proc.time() - timer)[3],5) - # print("nn2.5") + print("nn2.5") # timer <- proc.time() #Drop the paired match out of the iteration matrix From 0875a69017e130c7e902b459c93d0ddf1aa91a8c Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 6 Nov 2015 12:00:13 -0500 Subject: [PATCH 157/212] test nn --- R/fastNN_binary_func.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 40ff9e9..9ea8262 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -125,13 +125,15 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { pair_id = paste(curgrp,j, sep="") # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4") + print("nn2.4a") # timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + print("nn2.4b") + dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id From 17b1bb790f2b73a6c541dd2ed4835ca9a19e5971 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 6 Nov 2015 12:02:22 -0500 Subject: [PATCH 158/212] test nn --- R/fastNN_binary_func.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 9ea8262..7b82e46 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -124,15 +124,16 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j, sep="") + + # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4a") + print("nn2.4x") # timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - print("nn2.4b") dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id @@ -140,7 +141,6 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # time_list[5] <- round((proc.time() - timer)[3],5) - print("nn2.5") # timer <- proc.time() #Drop the paired match out of the iteration matrix From 7697483d6ee958cd39db7e0e04d45ff2a95cc3d7 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 6 Nov 2015 12:06:54 -0500 Subject: [PATCH 159/212] test nn --- R/fastNN_binary_func.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 7b82e46..cc8fd83 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -23,8 +23,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - dta@data[["match"]] <- -999 - dta@data[["PSM_distance"]] <- -999 + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 print("nn2") @@ -131,11 +131,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id From e6076575d378d3c24ca99d7a4be53e97335e2de5 Mon Sep 17 00:00:00 2001 From: userz Date: Mon, 9 Nov 2015 16:20:07 -0500 Subject: [PATCH 160/212] data table test for nn --- R/SAT.R | 249 +++++++++++++++++++++++++++++++++++++++++ R/fastNN_binary_func.R | 208 +++++++++++++++++++++++++++++++--- R/loadLibs.R | 2 + 3 files changed, 441 insertions(+), 18 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 7396621..1f6d06c 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -1,3 +1,252 @@ +# SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName) { + +# dta <- as.data.table(dta@data) + +# #Initialization +# pltObjs <- list() +# init_dta <- dta + +# drop_unmatched = drop_opts["drop_unmatched"] +# drop_method = drop_opts["drop_method"] +# drop_thresh = as.numeric(drop_opts["drop_thresh"]) + + + +# print("sat1") + +# if (!is.null(constraints) && contraints != c()) { +# for (cst in 1:length(names(constraints))) { +# if (names(constraints)[cst] == "groups") { +# dta[,"ConstraintGroupSet_Opt"] <- dta[,get(constraints["groups"])] + +# } else { +# dta[,"ConstraintGroupSet_Opt"] <- 1 +# } + +# if (names(constraints)[cst] == "distance") { +# dist_PSM = as.numeric(constraints["distance"][[1]]) +# } else { +# dist_PSM=NULL +# } +# } + +# } else { +# dta[,"ConstraintGroupSet_Opt"] <- 1 +# #max the distance threshold by taking the diagonal of the bounding box. +# dist_PSM = NULL +# } + + + +# print("sat2") + +# #Caclulate the number of groups to constrain by, if any. +# group_constraints <- unique(dta[,'ConstraintGroupSet_Opt'] + +# #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) +# t_dta <- list() +# u_dta <-list() +# grp_list <- list() +# cnt = 0 + +# for (grp in 1:length(group_constraints)) { +# cur_grp <- as.matrix(group_constraints)[grp] +# grp_index = length(grp_list)+1 +# t_index = length(t_dta)+1 +# grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] + +# t_dta[[t_index]] <- dta[TrtBin == 1] +# u_dta[[t_index]] <- dta[TrtBin == 0] + +# has_treated <- cur_grp %in% t_dta[[t_index]][,'ConstraintGroupSet_Opt'] +# has_untreated <- cur_grp %in% u_dta[[t_index]][,'ConstraintGroupSet_Opt'] + +# if ((has_untreated == FALSE) || (has_treated == FALSE)) { +# dta <- dta['ConstraintGroupSet_Opt' != cur_grp] +# t_dta[[t_index]] <- NULL +# u_dta[[t_index]] <- NULL +# grp_list[[t_index]] <- NULL +# war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") +# warning(war_statement) + +# } else { +# t_dta[[t_index]] <- t_dta[[t_index]][ConstraintGroupSet_Opt == (cur_grp)] +# u_dta[[t_index]] <- u_dta[[t_index]][ConstraintGroupSet_Opt == (cur_grp)] + +# cnt = cnt + 1 +# } +# } + + +# if (cnt == 0) { +# return('drop') +# } + + +# print("sat3") + +# temp_dta <- list() + +# for (i in 1:cnt) { +# cur_grp <- grp_list[[i]] + +# print("sat3.1") +# it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + +# print("sat3.2") +# if (mtd == "fastNN") { +# # *** +# # this is the slow part of functions +# temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) +# } + +# if (mtd == "NN_WithReplacement") { +# print("NN with replacement is currently not available, please choose fastNN") +# # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) +# } +# } + +# print("sat4") + +# #Build the final datasets from subsets +# if (cnt > 1) { +# dta <- temp_dta[[1]] +# for(k in 2:cnt) { +# dta <- maptools::spRbind(dta, temp_dta[[k]]) +# } +# } else { +# dta <- temp_dta[[1]] +# } + +# print("sat5") + +# if (drop_unmatched == TRUE) { +# dta <- dta["PSM_match_ID" != -999] +# } + +# anc_v_int <- strsplit(psm_eq, "~")[[1]][2] +# anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) +# anc_vars <- c(anc_vars[[1]], "PSM_trtProb") + +# print("sat6") + +# #Drop observations according to the selected method +# if (drop_method == "SD") { +# #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. +# psm_sd_thresh = sd(dta[,"PSM_trtProb"]) * drop_thresh +# if (visual == "TRUE") { +# print(psm_sd_thresh) +# } +# dta <- dta["PSM_distance" < psm_sd_thresh] +# } + + + +# #Plot the pre and post-dropping balance for PSM model... +# #Balance metrics are based on "Misunderstandings between experimentalists and +# #observationalists about causal inference", Imal, King, and Stuart. +# #Simplest suggestion of comparing means and checking if .25 SD apart used. +# cnt = 0 + +# print("sat7") + +# for (i in 1:length(anc_vars)) { + +# print("sat7.0") + +# #gsub to remove any factors() +# ed_v = sub("factor\\(","",anc_vars[i]) +# ed_v = sub(")","",ed_v) + +# c_type = class(init_dta[[ed_v]]) + +# print("sat7.1") +# if (c_type == "matrix") { + +# dta[,ed_v] <- as.numeric(dta[,ed_v]) + +# init_dta[,ed_v] <- as.numeric(init_dta[,ed_v]) + +# c_type = "numeric" +# } + +# print("sat7.2") +# if ((c_type == "numeric") & (visual == "TRUE")) { +# cnt = cnt + 1 +# pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) +# pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) + + +# treat_mean_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[2]][[3]], 5) +# treat_SD_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[2]][[4]], 5) +# control_mean_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[1]][[3]], 5) +# control_SD_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[1]][[4]], 5) + +# treat_mean_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[2]][[3]], 5) +# treat_SD_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[2]][[4]], 5) +# control_mean_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[1]][[3]], 5) +# control_SD_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[1]][[4]], 5) + + +# it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) +# it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) + +# if (!exists("bRes")) { + +# bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, +# treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, +# it_diff_Mean_pre,it_diff_Mean_post) + +# colnames(bRes)[1] <- "Pre-Balance Treated Mean" +# colnames(bRes)[2] <- "Pre-Balance Treated SD" +# colnames(bRes)[3] <- "Pre-Balance Control Mean" +# colnames(bRes)[4] <- "Pre-Balance Control SD" + +# colnames(bRes)[5] <- "Post-Balance Treated Mean" +# colnames(bRes)[6] <- "Post-Balance Treated SD" +# colnames(bRes)[7] <- "Post-Balance Control Mean" +# colnames(bRes)[8] <- "Post-Balance Control SD" + +# colnames(bRes)[9] <- "Mean Difference Pre-Balance" +# colnames(bRes)[10] <- "Mean Difference Post-Balance" + +# } else { +# bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, +# treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, +# it_diff_Mean_pre,it_diff_Mean_post)) +# } + +# rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]", "", ed_v) +# } +# } + +# print("sat8") + +# if (visual=="TRUE") { +# #Output graphics +# #Remove the factor rows +# nrow_c <- length(pltObjs) +# counter <- 1 +# while (counter <= nrow_c) { +# d = counter + 3 +# if (d > nrow_c) { +# d = nrow_c +# } +# do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) +# counter = counter + 4 +# } +# #bTab <- stargazer(bRes,summary=FALSE,type="html") +# #print.htmlTable(bTab) +# } + + +# return (as.data.frame(dta)) +# } + + + + + SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName) { #Initialization pltObjs <- list() diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index cc8fd83..e4d649f 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -8,19 +8,21 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") - timerx <- proc.time() + # timerx <- proc.time() + #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + sorted_dta <- as.data.table(dta@data) + sorted_dta <- sorted_dta[order("PSM_trtProb")][, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] #Conduct the matching - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] + treated <- sorted_dta[get(trtMntVar) == 1] + untreated <- sorted_dta[get(trtMntVar) == 0] it_cnt = min(length(treated[[1]]), length(untreated[[1]])) # dta@data[["match"]] <- -999 @@ -52,7 +54,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # print(length(untreated[[1]])) # summary(untreated[["PSM_trtProb"]]) - k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) # time_list[2] <- round((proc.time() - timer)[3],5) print("nn2.2") @@ -65,11 +67,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn2.2.0") #Calculate the Euclidean Distance between pairs - Control_ID = toString(untreated[[ids]][[mC]]) + Control_ID = toString(untreated[mC, get(ids)]) mT = k[["nn.index"]][mC] - Treatment_ID = toString(treated[[ids]][[mT]]) + Treatment_ID = toString(treated[mT, get(ids)]) #Find the control x,y location cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) @@ -108,17 +110,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { best_m_treated = k[["nn.index"]][best_m_control] - #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) + #Control and Treatment PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) + #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[,ids][best_m_control]) - # Treatment_ID = toString(treated[,ids][best_m_treated]) + Control_ID = toString(untreated[best_m_control, get(ids)]) + Treatment_ID = toString(treated[best_m_control, get(ids)]) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) @@ -149,17 +151,187 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[which(treated[[ids]] != Treatment_ID),] - untreated <- untreated[which(untreated[[ids]] != Control_ID),] + treated <- treated[get(ids) != (Treatment_ID)] + untreated <- untreated[get(ids) != (Control_ID)] # time_list[6] <- round((proc.time() - timer)[3],5) # print(paste(time_list)) } - print((proc.time() - timerx)[3]) + # print((proc.time() - timerx)[3]) return(dta) } + + + + +# #FastNN +# #Algorithm to find a hopefully near-optimal match of pairs +# #In a treatment and control group +# #Works by first ordering by the propensity score matching value, +# #and then working through this list in order from highest to lowest. +# #Matches are removed each step. + +# fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + +# print("nn1.0") +# timerx <- proc.time() + + + +# #Fast nearest neighbors search - will not arrive at optimum, +# #but this may not be an issue for many analysis. +# #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + +# sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + + +# #Conduct the matching +# treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] +# untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] + +# it_cnt = min(length(treated[[1]]), length(untreated[[1]])) +# # dta@data[["match"]] <- -999 +# # dta@data[["PSM_distance"]] <- -999 +# dta@data[["PSM_match_ID"]] <- -999 + +# print("nn2") + +# #Calculate a distance decay function +# #to perturb pairs based on their distances. +# for (j in 1:it_cnt) { +# # time_list <- c() + +# print("nn2.0") +# # timer <- proc.time() + +# # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] +# # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + +# # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + +# # time_list[1] <- round((proc.time() - timer)[3],5) +# # print("nn2.1") +# # timer <- proc.time() + +# #Run the KNN for all neighbors. +# # print(length(treated[[1]])) +# # summary(treated[["PSM_trtProb"]]) +# # print(length(untreated[[1]])) +# # summary(untreated[["PSM_trtProb"]]) + +# k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + +# # time_list[2] <- round((proc.time() - timer)[3],5) +# print("nn2.2") +# # timer <- proc.time() + +# #Perturb the values based on the distance decay function, if selected. +# if (!is.null(dist_PSM)) { +# for (mC in 1:length(k[[1]])) { + +# print("nn2.2.0") + +# #Calculate the Euclidean Distance between pairs +# Control_ID = toString(untreated[[ids]][[mC]]) + +# mT = k[["nn.index"]][mC] + +# Treatment_ID = toString(treated[[ids]][[mT]]) + +# #Find the control x,y location +# cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + + +# #Find the treatment x,y location +# tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + +# y_dist = abs(cCoord[1] - cCoord[2]) +# x_dist = abs(tCoord[1] - tCoord[2]) +# euc_dist = sqrt(y_dist^2 + x_dist^2) + +# print("nn2.2.1") + +# PSM_score = k[["nn.dist"]][mC] +# geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + +# print("nn2.2.2") + + +# k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + +# } + +# } + +# # time_list[3] <- round((proc.time() - timer)[3],5) +# print("nn2.3") +# # timer <- proc.time() + +# #Add the matched treatment and control values to the recording data frame +# #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. +# best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + +# #This will give us the matched index in the "treated" dataset. +# best_m_treated = k[["nn.index"]][best_m_control] + + +# #Control PSM ID +# cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") +# Control_ID = toString(eval(parse(text=cid_txt))) + +# #Treatment PSM ID +# tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") +# Treatment_ID = toString(eval(parse(text=tid_txt))) + +# #Control PSM ID and Treatment PSM ID +# # Control_ID = toString(untreated[,ids][best_m_control]) +# # Treatment_ID = toString(treated[,ids][best_m_treated]) + + +# #Create a unique pair ID for each group (will simply append a "1" if only 1 group) +# pair_id = paste(curgrp,j, sep="") + + + +# # time_list[4] <- round((proc.time() - timer)[3],5) +# print("nn2.4x") +# # timer <- proc.time() + +# #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row +# # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID +# # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + + +# # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] +# dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + + + +# # time_list[5] <- round((proc.time() - timer)[3],5) +# # timer <- proc.time() + +# #Drop the paired match out of the iteration matrix +# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] +# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + +# # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + +# treated <- treated[which(treated[[ids]] != Treatment_ID),] +# untreated <- untreated[which(untreated[[ids]] != Control_ID),] + +# # time_list[6] <- round((proc.time() - timer)[3],5) +# # print(paste(time_list)) + +# } + +# print((proc.time() - timerx)[3]) + +# return(dta) + +# } + diff --git a/R/loadLibs.R b/R/loadLibs.R index 2eb2d52..cff4216 100644 --- a/R/loadLibs.R +++ b/R/loadLibs.R @@ -19,4 +19,6 @@ loadLibs <- function (x=1) { library(stargazer) library(lmtest) library(multiwayvcov) + + library(data.table) } From 77a292ed80618883d22664fb908e1178e762616b Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 09:16:30 -0500 Subject: [PATCH 161/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index e4d649f..ecf56ee 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -17,7 +17,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. sorted_dta <- as.data.table(dta@data) - sorted_dta <- sorted_dta[order("PSM_trtProb")][, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] + sorted_dta <- sorted_dta[order(PSM_trtProb)][, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] #Conduct the matching From db5cb1183108fbaa7fa776feb551048f83f8b5ab Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 09:20:58 -0500 Subject: [PATCH 162/212] data table test for nn --- R/fastNN_binary_func.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index ecf56ee..bea8b0f 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -17,6 +17,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. sorted_dta <- as.data.table(dta@data) + + print(colnames(sorted_dta)) sorted_dta <- sorted_dta[order(PSM_trtProb)][, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] From f7ea0e64eb37707eda8ac73358f9d7ee87d009d4 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 09:22:11 -0500 Subject: [PATCH 163/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index bea8b0f..8f28851 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -18,8 +18,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- as.data.table(dta@data) - print(colnames(sorted_dta)) sorted_dta <- sorted_dta[order(PSM_trtProb)][, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] + print(colnames(sorted_dta)) #Conduct the matching From 8c15100195bbf9f8982e181ad95d6a2468ab8147 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 09:24:13 -0500 Subject: [PATCH 164/212] data table test for nn --- R/fastNN_binary_func.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 8f28851..0d70ab0 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -18,10 +18,13 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- as.data.table(dta@data) - sorted_dta <- sorted_dta[order(PSM_trtProb)][, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] + sorted_dta <- sorted_dta[order(PSM_trtProb)] print(colnames(sorted_dta)) + sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] + + #Conduct the matching treated <- sorted_dta[get(trtMntVar) == 1] untreated <- sorted_dta[get(trtMntVar) == 0] From 642344d70c2be0da4618dc54a8bfe183f7fff33d Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 09:28:22 -0500 Subject: [PATCH 165/212] data table test for nn --- R/SAT.R | 2 +- R/fastNN_binary_func.R | 388 ++++++++++++++++++++--------------------- 2 files changed, 195 insertions(+), 195 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 1f6d06c..96bf259 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -343,7 +343,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (mtd == "fastNN") { # *** # this is the slow part of functions - temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) } if (mtd == "NN_WithReplacement") { diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 0d70ab0..8d9f5e3 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -1,179 +1,3 @@ -#FastNN -#Algorithm to find a hopefully near-optimal match of pairs -#In a treatment and control group -#Works by first ordering by the propensity score matching value, -#and then working through this list in order from highest to lowest. -#Matches are removed each step. - -fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - - print("nn1.0") - # timerx <- proc.time() - - - - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - - sorted_dta <- as.data.table(dta@data) - - sorted_dta <- sorted_dta[order(PSM_trtProb)] - print(colnames(sorted_dta)) - - - sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] - - - #Conduct the matching - treated <- sorted_dta[get(trtMntVar) == 1] - untreated <- sorted_dta[get(trtMntVar) == 0] - - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 - - print("nn2") - - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) { - # time_list <- c() - - print("nn2.0") - # timer <- proc.time() - - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() - - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) - - k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) - - # time_list[2] <- round((proc.time() - timer)[3],5) - print("nn2.2") - # timer <- proc.time() - - #Perturb the values based on the distance decay function, if selected. - if (!is.null(dist_PSM)) { - for (mC in 1:length(k[[1]])) { - - print("nn2.2.0") - - #Calculate the Euclidean Distance between pairs - Control_ID = toString(untreated[mC, get(ids)]) - - mT = k[["nn.index"]][mC] - - Treatment_ID = toString(treated[mT, get(ids)]) - - #Find the control x,y location - cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - - - #Find the treatment x,y location - tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - - y_dist = abs(cCoord[1] - cCoord[2]) - x_dist = abs(tCoord[1] - tCoord[2]) - euc_dist = sqrt(y_dist^2 + x_dist^2) - - print("nn2.2.1") - - PSM_score = k[["nn.dist"]][mC] - geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - - print("nn2.2.2") - - - k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - - } - - } - - # time_list[3] <- round((proc.time() - timer)[3],5) - print("nn2.3") - # timer <- proc.time() - - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - - #This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] - - - #Control and Treatment PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) - - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) - - - #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[best_m_control, get(ids)]) - Treatment_ID = toString(treated[best_m_control, get(ids)]) - - - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") - - - - # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4x") - # timer <- proc.time() - - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - - - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - - - - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() - - #Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - - treated <- treated[get(ids) != (Treatment_ID)] - untreated <- untreated[get(ids) != (Control_ID)] - - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) - - } - - # print((proc.time() - timerx)[3]) - - return(dta) - -} - - - - - # #FastNN # #Algorithm to find a hopefully near-optimal match of pairs # #In a treatment and control group @@ -184,7 +8,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # print("nn1.0") -# timerx <- proc.time() +# # timerx <- proc.time() @@ -192,12 +16,18 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # #but this may not be an issue for many analysis. # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. -# sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] +# sorted_dta <- as.data.table(dta@data) + +# sorted_dta <- sorted_dta[order(PSM_trtProb)] +# print(colnames(sorted_dta)) + + +# sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] # #Conduct the matching -# treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] -# untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] +# treated <- sorted_dta[get(trtMntVar) == 1] +# untreated <- sorted_dta[get(trtMntVar) == 0] # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) # # dta@data[["match"]] <- -999 @@ -229,7 +59,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # # print(length(untreated[[1]])) # # summary(untreated[["PSM_trtProb"]]) -# k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) +# k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) # # time_list[2] <- round((proc.time() - timer)[3],5) # print("nn2.2") @@ -242,11 +72,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # print("nn2.2.0") # #Calculate the Euclidean Distance between pairs -# Control_ID = toString(untreated[[ids]][[mC]]) +# Control_ID = toString(untreated[mC, get(ids)]) # mT = k[["nn.index"]][mC] -# Treatment_ID = toString(treated[[ids]][[mT]]) +# Treatment_ID = toString(treated[mT, get(ids)]) # #Find the control x,y location # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) @@ -285,17 +115,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # best_m_treated = k[["nn.index"]][best_m_control] -# #Control PSM ID -# cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") -# Control_ID = toString(eval(parse(text=cid_txt))) +# #Control and Treatment PSM ID +# # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") +# # Control_ID = toString(eval(parse(text=cid_txt))) -# #Treatment PSM ID -# tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") -# Treatment_ID = toString(eval(parse(text=tid_txt))) +# # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") +# # Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Control PSM ID and Treatment PSM ID -# # Control_ID = toString(untreated[,ids][best_m_control]) -# # Treatment_ID = toString(treated[,ids][best_m_treated]) +# Control_ID = toString(untreated[best_m_control, get(ids)]) +# Treatment_ID = toString(treated[best_m_control, get(ids)]) # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) @@ -326,17 +156,187 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 -# treated <- treated[which(treated[[ids]] != Treatment_ID),] -# untreated <- untreated[which(untreated[[ids]] != Control_ID),] +# treated <- treated[get(ids) != (Treatment_ID)] +# untreated <- untreated[get(ids) != (Control_ID)] # # time_list[6] <- round((proc.time() - timer)[3],5) # # print(paste(time_list)) # } -# print((proc.time() - timerx)[3]) +# # print((proc.time() - timerx)[3]) # return(dta) # } + + + + +#FastNN +#Algorithm to find a hopefully near-optimal match of pairs +#In a treatment and control group +#Works by first ordering by the propensity score matching value, +#and then working through this list in order from highest to lowest. +#Matches are removed each step. + +fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + + print("nn1.0") + timerx <- proc.time() + + + + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + + + #Conduct the matching + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] + + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 + + print("nn2") + + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + # time_list <- c() + + print("nn2.0") + # timer <- proc.time() + + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + + # time_list[1] <- round((proc.time() - timer)[3],5) + # print("nn2.1") + # timer <- proc.time() + + #Run the KNN for all neighbors. + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) + + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + + # time_list[2] <- round((proc.time() - timer)[3],5) + print("nn2.2") + # timer <- proc.time() + + #Perturb the values based on the distance decay function, if selected. + if (!is.null(dist_PSM)) { + for (mC in 1:length(k[[1]])) { + + print("nn2.2.0") + + #Calculate the Euclidean Distance between pairs + Control_ID = toString(untreated[[ids]][[mC]]) + + mT = k[["nn.index"]][mC] + + Treatment_ID = toString(treated[[ids]][[mT]]) + + #Find the control x,y location + cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + + + #Find the treatment x,y location + tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + + y_dist = abs(cCoord[1] - cCoord[2]) + x_dist = abs(tCoord[1] - tCoord[2]) + euc_dist = sqrt(y_dist^2 + x_dist^2) + + print("nn2.2.1") + + PSM_score = k[["nn.dist"]][mC] + geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + + print("nn2.2.2") + + + k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + + } + + } + + # time_list[3] <- round((proc.time() - timer)[3],5) + print("nn2.3") + # timer <- proc.time() + + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + + #This will give us the matched index in the "treated" dataset. + best_m_treated = k[["nn.index"]][best_m_control] + + + #Control PSM ID + cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + Control_ID = toString(eval(parse(text=cid_txt))) + + #Treatment PSM ID + tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + Treatment_ID = toString(eval(parse(text=tid_txt))) + + #Control PSM ID and Treatment PSM ID + # Control_ID = toString(untreated[,ids][best_m_control]) + # Treatment_ID = toString(treated[,ids][best_m_treated]) + + + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j, sep="") + + + + # time_list[4] <- round((proc.time() - timer)[3],5) + print("nn2.4x") + # timer <- proc.time() + + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + + + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + + + + # time_list[5] <- round((proc.time() - timer)[3],5) + # timer <- proc.time() + + #Drop the paired match out of the iteration matrix + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + + treated <- treated[which(treated[[ids]] != Treatment_ID),] + untreated <- untreated[which(untreated[[ids]] != Control_ID),] + + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) + + } + + print((proc.time() - timerx)[3]) + + return(dta) + +} + From 2ef7491c722bd2408eecc0136b5a8a415823c26f Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 11:44:40 -0500 Subject: [PATCH 166/212] data table test for nn --- R/fastNN_binary_func.R | 388 ++++++++++++++++++++--------------------- 1 file changed, 194 insertions(+), 194 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 8d9f5e3..ac94827 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -1,3 +1,179 @@ +#FastNN +#Algorithm to find a hopefully near-optimal match of pairs +#In a treatment and control group +#Works by first ordering by the propensity score matching value, +#and then working through this list in order from highest to lowest. +#Matches are removed each step. + +fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + + print("nn1.0") + # timerx <- proc.time() + + vvv <<- sorted_dta + + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + + sorted_dta <- as.data.table(dta@data) + + sorted_dta <- sorted_dta[order(PSM_trtProb)] + print(colnames(sorted_dta)) + + + sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] + + + #Conduct the matching + treated <- sorted_dta[get(trtMntVar) == 1] + untreated <- sorted_dta[get(trtMntVar) == 0] + + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 + + print("nn2") + + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + # time_list <- c() + + print("nn2.0") + # timer <- proc.time() + + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + + # time_list[1] <- round((proc.time() - timer)[3],5) + # print("nn2.1") + # timer <- proc.time() + + #Run the KNN for all neighbors. + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) + + k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) + + # time_list[2] <- round((proc.time() - timer)[3],5) + print("nn2.2") + # timer <- proc.time() + + #Perturb the values based on the distance decay function, if selected. + if (!is.null(dist_PSM)) { + for (mC in 1:length(k[[1]])) { + + print("nn2.2.0") + + #Calculate the Euclidean Distance between pairs + Control_ID = toString(untreated[mC, get(ids)]) + + mT = k[["nn.index"]][mC] + + Treatment_ID = toString(treated[mT, get(ids)]) + + #Find the control x,y location + cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + + + #Find the treatment x,y location + tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + + y_dist = abs(cCoord[1] - cCoord[2]) + x_dist = abs(tCoord[1] - tCoord[2]) + euc_dist = sqrt(y_dist^2 + x_dist^2) + + print("nn2.2.1") + + PSM_score = k[["nn.dist"]][mC] + geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + + print("nn2.2.2") + + + k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + + } + + } + + # time_list[3] <- round((proc.time() - timer)[3],5) + print("nn2.3") + # timer <- proc.time() + + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + + #This will give us the matched index in the "treated" dataset. + best_m_treated = k[["nn.index"]][best_m_control] + + + #Control and Treatment PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) + + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) + + + #Control PSM ID and Treatment PSM ID + Control_ID = toString(untreated[best_m_control, get(ids)]) + Treatment_ID = toString(treated[best_m_control, get(ids)]) + + + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j, sep="") + + + + # time_list[4] <- round((proc.time() - timer)[3],5) + print("nn2.4x") + # timer <- proc.time() + + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + + + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + + + + # time_list[5] <- round((proc.time() - timer)[3],5) + # timer <- proc.time() + + #Drop the paired match out of the iteration matrix + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + + treated <- treated[get(ids) != (Treatment_ID)] + untreated <- untreated[get(ids) != (Control_ID)] + + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) + + } + + # print((proc.time() - timerx)[3]) + + return(dta) + +} + + + + + # #FastNN # #Algorithm to find a hopefully near-optimal match of pairs # #In a treatment and control group @@ -8,7 +184,7 @@ # fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # print("nn1.0") -# # timerx <- proc.time() +# timerx <- proc.time() @@ -16,18 +192,12 @@ # #but this may not be an issue for many analysis. # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. -# sorted_dta <- as.data.table(dta@data) - -# sorted_dta <- sorted_dta[order(PSM_trtProb)] -# print(colnames(sorted_dta)) - - -# sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] +# sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] # #Conduct the matching -# treated <- sorted_dta[get(trtMntVar) == 1] -# untreated <- sorted_dta[get(trtMntVar) == 0] +# treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] +# untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) # # dta@data[["match"]] <- -999 @@ -59,7 +229,7 @@ # # print(length(untreated[[1]])) # # summary(untreated[["PSM_trtProb"]]) -# k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) +# k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) # # time_list[2] <- round((proc.time() - timer)[3],5) # print("nn2.2") @@ -72,11 +242,11 @@ # print("nn2.2.0") # #Calculate the Euclidean Distance between pairs -# Control_ID = toString(untreated[mC, get(ids)]) +# Control_ID = toString(untreated[[ids]][[mC]]) # mT = k[["nn.index"]][mC] -# Treatment_ID = toString(treated[mT, get(ids)]) +# Treatment_ID = toString(treated[[ids]][[mT]]) # #Find the control x,y location # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) @@ -115,17 +285,17 @@ # best_m_treated = k[["nn.index"]][best_m_control] -# #Control and Treatment PSM ID -# # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") -# # Control_ID = toString(eval(parse(text=cid_txt))) +# #Control PSM ID +# cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") +# Control_ID = toString(eval(parse(text=cid_txt))) -# # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") -# # Treatment_ID = toString(eval(parse(text=tid_txt))) - +# #Treatment PSM ID +# tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") +# Treatment_ID = toString(eval(parse(text=tid_txt))) # #Control PSM ID and Treatment PSM ID -# Control_ID = toString(untreated[best_m_control, get(ids)]) -# Treatment_ID = toString(treated[best_m_control, get(ids)]) +# # Control_ID = toString(untreated[,ids][best_m_control]) +# # Treatment_ID = toString(treated[,ids][best_m_treated]) # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) @@ -156,187 +326,17 @@ # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 -# treated <- treated[get(ids) != (Treatment_ID)] -# untreated <- untreated[get(ids) != (Control_ID)] +# treated <- treated[which(treated[[ids]] != Treatment_ID),] +# untreated <- untreated[which(untreated[[ids]] != Control_ID),] # # time_list[6] <- round((proc.time() - timer)[3],5) # # print(paste(time_list)) # } -# # print((proc.time() - timerx)[3]) +# print((proc.time() - timerx)[3]) # return(dta) # } - - - - -#FastNN -#Algorithm to find a hopefully near-optimal match of pairs -#In a treatment and control group -#Works by first ordering by the propensity score matching value, -#and then working through this list in order from highest to lowest. -#Matches are removed each step. - -fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - - print("nn1.0") - timerx <- proc.time() - - - - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - - - #Conduct the matching - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] - - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 - - print("nn2") - - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) { - # time_list <- c() - - print("nn2.0") - # timer <- proc.time() - - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() - - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) - - k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - - # time_list[2] <- round((proc.time() - timer)[3],5) - print("nn2.2") - # timer <- proc.time() - - #Perturb the values based on the distance decay function, if selected. - if (!is.null(dist_PSM)) { - for (mC in 1:length(k[[1]])) { - - print("nn2.2.0") - - #Calculate the Euclidean Distance between pairs - Control_ID = toString(untreated[[ids]][[mC]]) - - mT = k[["nn.index"]][mC] - - Treatment_ID = toString(treated[[ids]][[mT]]) - - #Find the control x,y location - cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - - - #Find the treatment x,y location - tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - - y_dist = abs(cCoord[1] - cCoord[2]) - x_dist = abs(tCoord[1] - tCoord[2]) - euc_dist = sqrt(y_dist^2 + x_dist^2) - - print("nn2.2.1") - - PSM_score = k[["nn.dist"]][mC] - geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - - print("nn2.2.2") - - - k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - - } - - } - - # time_list[3] <- round((proc.time() - timer)[3],5) - print("nn2.3") - # timer <- proc.time() - - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - - #This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] - - - #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) - - #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) - - #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[,ids][best_m_control]) - # Treatment_ID = toString(treated[,ids][best_m_treated]) - - - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") - - - - # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4x") - # timer <- proc.time() - - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - - - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - - - - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() - - #Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - - treated <- treated[which(treated[[ids]] != Treatment_ID),] - untreated <- untreated[which(untreated[[ids]] != Control_ID),] - - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) - - } - - print((proc.time() - timerx)[3]) - - return(dta) - -} - From c18bff9a0818bbb42fa354c59c1699796a95133a Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 11:46:36 -0500 Subject: [PATCH 167/212] data table test for nn --- R/fastNN_binary_func.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index ac94827..59b53b7 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -10,7 +10,6 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") # timerx <- proc.time() - vvv <<- sorted_dta #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. @@ -18,6 +17,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- as.data.table(dta@data) + vvv <<- sorted_dta + sorted_dta <- sorted_dta[order(PSM_trtProb)] print(colnames(sorted_dta)) From d33a14f15f78f775a133fb52e1e2ddc197c7e531 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 11:48:42 -0500 Subject: [PATCH 168/212] data table test for nn --- R/fastNN_binary_func.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 59b53b7..ed6c0b9 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -17,9 +17,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- as.data.table(dta@data) - vvv <<- sorted_dta - sorted_dta <- sorted_dta[order(PSM_trtProb)] + sorted_dta <- sorted_dta[order(get("PSM_trtProb"))] print(colnames(sorted_dta)) From 7346d54732fcc780c966b40542a6a15033a16c47 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 11:50:15 -0500 Subject: [PATCH 169/212] data table test for nn --- R/fastNN_binary_func.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index ed6c0b9..f1896a6 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -17,12 +17,15 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- as.data.table(dta@data) + print("x") + print(colnames(sorted_dta)) + sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] + print(colnames(sorted_dta)) sorted_dta <- sorted_dta[order(get("PSM_trtProb"))] print(colnames(sorted_dta)) - sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] #Conduct the matching From efb3780ca8142b84fd4e7df121f6936d4d405f63 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 11:54:45 -0500 Subject: [PATCH 170/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f1896a6..4f97839 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -22,7 +22,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] print(colnames(sorted_dta)) - sorted_dta <- sorted_dta[order(get("PSM_trtProb"))] + sorted_dta <- sorted_dta[order(sorted_dta$PSM_trtProb)] print(colnames(sorted_dta)) From d546f8cbeb56fa502a8c291fc5c7e859ba2eb7fb Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 11:58:56 -0500 Subject: [PATCH 171/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 4f97839..8421a80 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -22,7 +22,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] print(colnames(sorted_dta)) - sorted_dta <- sorted_dta[order(sorted_dta$PSM_trtProb)] + sorted_dta <- sorted_dta[order(get(ids))] print(colnames(sorted_dta)) From 3ff419110e55ed8b159488a4d0acac7b34df6bba Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:37:27 -0500 Subject: [PATCH 172/212] data table test for nn --- R/fastNN_binary_func.R | 92 ++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 61 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 8421a80..57ca7b5 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -8,73 +8,45 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") - # timerx <- proc.time() #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- as.data.table(dta@data) - - print("x") - print(colnames(sorted_dta)) - sorted_dta <- sorted_dta[, c(ids, trtMntVar, "PSM_trtProb"), with=FALSE] - print(colnames(sorted_dta)) - - sorted_dta <- sorted_dta[order(get(ids))] - print(colnames(sorted_dta)) + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + #Conduct the matching + treated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")]) + untreated <- as.data.tablesorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")]) - #Conduct the matching - treated <- sorted_dta[get(trtMntVar) == 1] - untreated <- sorted_dta[get(trtMntVar) == 0] - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + it_cnt = min(nrow(treated), nrow(untreated)) # dta@data[["match"]] <- -999 # dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 print("nn2") - #Calculate a distance decay function - #to perturb pairs based on their distances. + # Calculate a distance decay function + # to perturb pairs based on their distances. for (j in 1:it_cnt) { - # time_list <- c() - - print("nn2.0") - # timer <- proc.time() - - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() - - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) + print("nn2.1") + k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) - # time_list[2] <- round((proc.time() - timer)[3],5) print("nn2.2") - # timer <- proc.time() - #Perturb the values based on the distance decay function, if selected. + # Perturb the values based on the distance decay function, if selected. if (!is.null(dist_PSM)) { for (mC in 1:length(k[[1]])) { print("nn2.2.0") - #Calculate the Euclidean Distance between pairs + # Calculate the Euclidean Distance between pairs Control_ID = toString(untreated[mC, get(ids)]) mT = k[["nn.index"]][mC] @@ -106,19 +78,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { } - # time_list[3] <- round((proc.time() - timer)[3],5) print("nn2.3") - # timer <- proc.time() - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + # Add the matched treatment and control values to the recording data frame + # best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - #This will give us the matched index in the "treated" dataset. + # This will give us the matched index in the "treated" dataset. best_m_treated = k[["nn.index"]][best_m_control] - #Control and Treatment PSM ID + # Control and Treatment PSM ID # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") # Control_ID = toString(eval(parse(text=cid_txt))) @@ -126,19 +96,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Treatment_ID = toString(eval(parse(text=tid_txt))) - #Control PSM ID and Treatment PSM ID + # Control PSM ID and Treatment PSM ID Control_ID = toString(untreated[best_m_control, get(ids)]) Treatment_ID = toString(treated[best_m_control, get(ids)]) - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + # Create a unique pair ID for each group (will simply append a "1" if only 1 group) pair_id = paste(curgrp,j, sep="") - # time_list[4] <- round((proc.time() - timer)[3],5) print("nn2.4x") - # timer <- proc.time() #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID @@ -150,10 +118,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() - - #Drop the paired match out of the iteration matrix + # Drop the paired match out of the iteration matrix # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] @@ -162,12 +127,9 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { treated <- treated[get(ids) != (Treatment_ID)] untreated <- untreated[get(ids) != (Control_ID)] - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) - } - # print((proc.time() - timerx)[3]) + } return(dta) @@ -177,6 +139,14 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + + + + + + + + # #FastNN # #Algorithm to find a hopefully near-optimal match of pairs # #In a treatment and control group @@ -187,7 +157,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # print("nn1.0") -# timerx <- proc.time() +# # timerx <- proc.time() @@ -199,8 +169,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # #Conduct the matching -# treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1,] -# untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0,] +# treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] +# untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) # # dta@data[["match"]] <- -999 @@ -337,7 +307,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # } -# print((proc.time() - timerx)[3]) +# # print((proc.time() - timerx)[3]) # return(dta) From 6dc5d4bac7b3447167fa37826912dc9e5176d308 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:37:54 -0500 Subject: [PATCH 173/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 57ca7b5..f13d2bf 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -19,7 +19,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #Conduct the matching treated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")]) - untreated <- as.data.tablesorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")]) + untreated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")]) From 7c6ece533c506f41742441808cec47d2b75dffb1 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:43:10 -0500 Subject: [PATCH 174/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f13d2bf..4eeea35 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -9,7 +9,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") - + library(data.table) #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. From 921b77e876e5e7afbe951dab5814471f3e49a1e9 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:44:04 -0500 Subject: [PATCH 175/212] data table test for nn --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 4eeea35..599a8ba 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -36,7 +36,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn2.1") - k <- get.knnx(treated[, PSM_trtProb], untreated[, PSM_trtProb], 1) + k <- get.knnx(treated[, 'PSM_trtProb'], untreated[, 'PSM_trtProb'], 1) print("nn2.2") From 2b33b8dee08bd504fd6541d1b29cc386a52e7dfb Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:46:23 -0500 Subject: [PATCH 176/212] data table test for nn --- R/fastNN_binary_func.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 599a8ba..c077d82 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -9,7 +9,6 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn1.0") - library(data.table) #Fast nearest neighbors search - will not arrive at optimum, #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. @@ -47,11 +46,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print("nn2.2.0") # Calculate the Euclidean Distance between pairs - Control_ID = toString(untreated[mC, get(ids)]) + Control_ID = toString(untreated[mC, get(ids), with=FALSE]) mT = k[["nn.index"]][mC] - Treatment_ID = toString(treated[mT, get(ids)]) + Treatment_ID = toString(treated[mT, get(ids), with=FALSE]) #Find the control x,y location cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) From bc96f18743ef50d1d17ae0158fc8ad03a3725f2b Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:47:36 -0500 Subject: [PATCH 177/212] data table test for nn --- R/fastNN_binary_func.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index c077d82..2c5cf92 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -96,8 +96,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[best_m_control, get(ids)]) - Treatment_ID = toString(treated[best_m_control, get(ids)]) + Control_ID = toString(untreated[best_m_control, get(ids), with=FALSE]) + Treatment_ID = toString(treated[best_m_control, get(ids), with=FALSE]) # Create a unique pair ID for each group (will simply append a "1" if only 1 group) From dcde29724dd7eea42f9fd4a99e983ff50af5781d Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:48:26 -0500 Subject: [PATCH 178/212] data table test for nn --- R/fastNN_binary_func.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 2c5cf92..dcf4d9a 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -96,8 +96,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[best_m_control, get(ids), with=FALSE]) - Treatment_ID = toString(treated[best_m_control, get(ids), with=FALSE]) + Control_ID = toString(untreated[best_m_control, (ids), with=FALSE]) + Treatment_ID = toString(treated[best_m_control, (ids), with=FALSE]) # Create a unique pair ID for each group (will simply append a "1" if only 1 group) From 55d475395c5e8b25b1748634aa91a2785ce5f7fd Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:50:10 -0500 Subject: [PATCH 179/212] data table test for nn --- R/fastNN_binary_func.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index dcf4d9a..3d9c2dc 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -94,7 +94,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") # Treatment_ID = toString(eval(parse(text=tid_txt))) - + print(class(untreated)) + print(colnames(untreated)) # Control PSM ID and Treatment PSM ID Control_ID = toString(untreated[best_m_control, (ids), with=FALSE]) Treatment_ID = toString(treated[best_m_control, (ids), with=FALSE]) From 3f04e6d5bd6faf0689d45811462e7828ef67e81d Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 12:54:23 -0500 Subject: [PATCH 180/212] data table test for nn --- R/fastNN_binary_func.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 3d9c2dc..7b51489 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -96,6 +96,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { print(class(untreated)) print(colnames(untreated)) + # Control PSM ID and Treatment PSM ID Control_ID = toString(untreated[best_m_control, (ids), with=FALSE]) Treatment_ID = toString(treated[best_m_control, (ids), with=FALSE]) @@ -124,8 +125,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[get(ids) != (Treatment_ID)] - untreated <- untreated[get(ids) != (Control_ID)] + treated <- treated[get(ids) != (Treatment_ID), with=FALSE] + untreated <- untreated[get(ids) != (Control_ID), with=FALSE] From be8cb34a7baa0d2bedcc6b22723ca73f5fce4dc6 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 13:05:01 -0500 Subject: [PATCH 181/212] data table test for nn --- R/fastNN_binary_func.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 7b51489..080b910 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -125,8 +125,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[get(ids) != (Treatment_ID), with=FALSE] - untreated <- untreated[get(ids) != (Control_ID), with=FALSE] + qt = quote(ids != Treatment_ID) + qu = quote(ids != Treatment_ID) + + treated <- treated[eval(qt)] + untreated <- untreated[eval(qu)] From c1b56543716d130aa2ca594429a9cb6353c7e25d Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 13:48:49 -0500 Subject: [PATCH 182/212] Cleaning up fastnn and sat. --- R/SAT.R | 128 ++++++++++++++++++++++++----------------- R/fastNN_binary_func.R | 53 +++++++++-------- 2 files changed, 102 insertions(+), 79 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 96bf259..9007333 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -261,6 +261,9 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat1") if (!is.null(constraints) && contraints != c()) { + + print("sat1a.1") + for (cst in 1:length(names(constraints))) { if (names(constraints)[cst] == "groups") { dta@data[,"ConstraintGroupSet_Opt"] <- dta@data[,constraints["groups"]] @@ -275,83 +278,99 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo dist_PSM=NULL } } - - } else { - dta@data[,"ConstraintGroupSet_Opt"] <- 1 - #max the distance threshold by taking the diagonal of the bounding box. - dist_PSM = NULL - } - - - print("sat2") - #Caclulate the number of groups to constrain by, if any. - group_constraints <- unique(dta$ConstraintGroupSet_Opt) - - #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) - t_dta <- list() - u_dta <-list() - grp_list <- list() - cnt = 0 + print("sat1a.2") - for (grp in 1:length(group_constraints)) { - cur_grp <- as.matrix(group_constraints)[grp] - grp_index = length(grp_list)+1 - t_index = length(t_dta)+1 - grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] + # Caclulate the number of groups to constrain by, if any. + group_constraints <- unique(dta$ConstraintGroupSet_Opt) + + # Make sure there are both treatment and control groups of an adequate size (>= 1 of each) + t_dta <- list() + u_dta <-list() + grp_list <- list() + cnt = 0 + + for (grp in 1:length(group_constraints)) { + cur_grp <- as.matrix(group_constraints)[grp] + grp_index = length(grp_list)+1 + t_index = length(t_dta)+1 + grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] + + t_dta[[t_index]] <- dta[dta$TrtBin == 1,] + u_dta[[t_index]] <- dta[dta$TrtBin == 0,] + + treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt + untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt + + if ((untreated_count == FALSE) || (treatment_count == FALSE)) { + dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] + t_dta[[t_index]] <- NULL + u_dta[[t_index]] <- NULL + grp_list[[t_index]] <- NULL + war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") + warning(war_statement) + + } else { + t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + + cnt = cnt + 1 + } + } - t_dta[[t_index]] <- dta[dta$TrtBin == 1,] - u_dta[[t_index]] <- dta[dta$TrtBin == 0,] - treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt - untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt + if (cnt == 0) { + return('drop') + } - if ((untreated_count == FALSE) || (treatment_count == FALSE)) { - dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] - t_dta[[t_index]] <- NULL - u_dta[[t_index]] <- NULL - grp_list[[t_index]] <- NULL - war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") - warning(war_statement) - } else { - t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + print("sat1a.3") - cnt = cnt + 1 - } - } + for (i in 1:cnt) { + cur_grp <- grp_list[[i]] - if (cnt == 0) { - return('drop') - } + print("sat3.1") + it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + print("sat3.2") + if (mtd == "fastNN") { + # *** + # this is the slow part of functions + temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) + } - print("sat3") + # if (mtd == "NN_WithReplacement") { + # print("NN with replacement is currently not available, please choose fastNN") + # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + # } + } - temp_dta <- list() + + } else { - for (i in 1:cnt) { - cur_grp <- grp_list[[i]] + print("sat1b.1") - print("sat3.1") - it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + temp_dta <- list() - print("sat3.2") if (mtd == "fastNN") { # *** # this is the slow part of functions - temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) + temp_dta[[i]] <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) } - if (mtd == "NN_WithReplacement") { - print("NN with replacement is currently not available, please choose fastNN") - # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) - } + # if (mtd == "NN_WithReplacement") { + # print("NN with replacement is currently not available, please choose fastNN") + # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + # } + + } + + + print("sat4") #Build the final datasets from subsets @@ -364,6 +383,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo dta <- temp_dta[[1]] } + print("sat5") if (drop_unmatched == TRUE) { diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 080b910..5308fd9 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -37,45 +37,45 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { k <- get.knnx(treated[, 'PSM_trtProb'], untreated[, 'PSM_trtProb'], 1) - print("nn2.2") + # print("nn2.2") - # Perturb the values based on the distance decay function, if selected. - if (!is.null(dist_PSM)) { - for (mC in 1:length(k[[1]])) { + # # Perturb the values based on the distance decay function, if selected. + # if (!is.null(dist_PSM)) { + # for (mC in 1:length(k[[1]])) { - print("nn2.2.0") + # print("nn2.2.0") - # Calculate the Euclidean Distance between pairs - Control_ID = toString(untreated[mC, get(ids), with=FALSE]) + # # Calculate the Euclidean Distance between pairs + # Control_ID = toString(untreated[mC, get(ids), with=FALSE]) - mT = k[["nn.index"]][mC] + # mT = k[["nn.index"]][mC] - Treatment_ID = toString(treated[mT, get(ids), with=FALSE]) + # Treatment_ID = toString(treated[mT, get(ids), with=FALSE]) - #Find the control x,y location - cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # #Find the control x,y location + # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - #Find the treatment x,y location - tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # #Find the treatment x,y location + # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - y_dist = abs(cCoord[1] - cCoord[2]) - x_dist = abs(tCoord[1] - tCoord[2]) - euc_dist = sqrt(y_dist^2 + x_dist^2) + # y_dist = abs(cCoord[1] - cCoord[2]) + # x_dist = abs(tCoord[1] - tCoord[2]) + # euc_dist = sqrt(y_dist^2 + x_dist^2) - print("nn2.2.1") + # print("nn2.2.1") - PSM_score = k[["nn.dist"]][mC] - geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # PSM_score = k[["nn.dist"]][mC] + # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - print("nn2.2.2") + # print("nn2.2.2") - k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - } + # } - } + # } print("nn2.3") @@ -103,8 +103,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") - + if (curgrp != NULL) { + pair_id <- paste(curgrp,j, sep="") + else { + pair_id <- paste('pair',j, sep="") + } print("nn2.4x") From 3aae82fd6cd92125277418629f73b12f724a85e8 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 13:49:17 -0500 Subject: [PATCH 183/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 5308fd9..ad83aaf 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -105,7 +105,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Create a unique pair ID for each group (will simply append a "1" if only 1 group) if (curgrp != NULL) { pair_id <- paste(curgrp,j, sep="") - else { + } else { pair_id <- paste('pair',j, sep="") } From 975c502c27d9e38fc486bb8d2185179e00a853c6 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 13:50:47 -0500 Subject: [PATCH 184/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index ad83aaf..fea89b8 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -103,10 +103,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Create a unique pair ID for each group (will simply append a "1" if only 1 group) - if (curgrp != NULL) { - pair_id <- paste(curgrp,j, sep="") - } else { + if (curgrp == NULL) { pair_id <- paste('pair',j, sep="") + } else { + pair_id <- paste(curgrp,j, sep="") + } From 9fa1103a5539e4e649ba34eb276333c4cc0c25ce Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 13:51:40 -0500 Subject: [PATCH 185/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index fea89b8..a2fe169 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -103,7 +103,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Create a unique pair ID for each group (will simply append a "1" if only 1 group) - if (curgrp == NULL) { + if (is.null(curgrp)) { pair_id <- paste('pair',j, sep="") } else { pair_id <- paste(curgrp,j, sep="") From 83bb0a01d909ebddcec1549af4c6540ce2cb1a82 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 13:55:30 -0500 Subject: [PATCH 186/212] Cleaning up fastnn and sat. --- R/SAT.R | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 9007333..5cdad3b 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -289,7 +289,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo t_dta <- list() u_dta <-list() grp_list <- list() - cnt = 0 + cnt <- 0 for (grp in 1:length(group_constraints)) { cur_grp <- as.matrix(group_constraints)[grp] @@ -315,7 +315,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - cnt = cnt + 1 + cnt <- cnt + 1 } } @@ -331,10 +331,10 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo for (i in 1:cnt) { cur_grp <- grp_list[[i]] - print("sat3.1") + print("sat1a.3.1") it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - print("sat3.2") + print("sat1a.3.2") if (mtd == "fastNN") { # *** # this is the slow part of functions @@ -347,19 +347,33 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # } } + print("sat1a.4") + + #Build the final datasets from subsets + if (cnt > 1) { + dta <- temp_dta[[1]] + for(k in 2:cnt) { + dta <- maptools::spRbind(dta, temp_dta[[k]]) + } + } else { + dta <- temp_dta[[1]] + } + } else { print("sat1b.1") + cnt <- 1 temp_dta <- list() if (mtd == "fastNN") { # *** # this is the slow part of functions - temp_dta[[i]] <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) + dta <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) } + # if (mtd == "NN_WithReplacement") { # print("NN with replacement is currently not available, please choose fastNN") # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) @@ -371,19 +385,6 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo - print("sat4") - - #Build the final datasets from subsets - if (cnt > 1) { - dta <- temp_dta[[1]] - for(k in 2:cnt) { - dta <- maptools::spRbind(dta, temp_dta[[k]]) - } - } else { - dta <- temp_dta[[1]] - } - - print("sat5") if (drop_unmatched == TRUE) { From b8f1a1ac8499e3904b33ea2df7b5a8e5ab426894 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 14:03:59 -0500 Subject: [PATCH 187/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 384 ++++++++++++++++++++--------------------- 1 file changed, 192 insertions(+), 192 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index a2fe169..b16694d 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -1,147 +1,147 @@ -#FastNN -#Algorithm to find a hopefully near-optimal match of pairs -#In a treatment and control group -#Works by first ordering by the propensity score matching value, -#and then working through this list in order from highest to lowest. -#Matches are removed each step. +# #FastNN +# #Algorithm to find a hopefully near-optimal match of pairs +# #In a treatment and control group +# #Works by first ordering by the propensity score matching value, +# #and then working through this list in order from highest to lowest. +# #Matches are removed each step. -fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { +# fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - print("nn1.0") +# print("nn1.0") - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. +# #Fast nearest neighbors search - will not arrive at optimum, +# #but this may not be an issue for many analysis. +# #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] +# sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - #Conduct the matching - treated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")]) - untreated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")]) +# #Conduct the matching +# treated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")]) +# untreated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")]) - it_cnt = min(nrow(treated), nrow(untreated)) - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 +# it_cnt = min(nrow(treated), nrow(untreated)) +# # dta@data[["match"]] <- -999 +# # dta@data[["PSM_distance"]] <- -999 +# dta@data[["PSM_match_ID"]] <- -999 - print("nn2") +# print("nn2") - # Calculate a distance decay function - # to perturb pairs based on their distances. - for (j in 1:it_cnt) { +# # Calculate a distance decay function +# # to perturb pairs based on their distances. +# for (j in 1:it_cnt) { - print("nn2.1") +# print("nn2.1") - k <- get.knnx(treated[, 'PSM_trtProb'], untreated[, 'PSM_trtProb'], 1) +# k <- get.knnx(treated[, 'PSM_trtProb'], untreated[, 'PSM_trtProb'], 1) - # print("nn2.2") +# # print("nn2.2") - # # Perturb the values based on the distance decay function, if selected. - # if (!is.null(dist_PSM)) { - # for (mC in 1:length(k[[1]])) { +# # # Perturb the values based on the distance decay function, if selected. +# # if (!is.null(dist_PSM)) { +# # for (mC in 1:length(k[[1]])) { - # print("nn2.2.0") +# # print("nn2.2.0") - # # Calculate the Euclidean Distance between pairs - # Control_ID = toString(untreated[mC, get(ids), with=FALSE]) +# # # Calculate the Euclidean Distance between pairs +# # Control_ID = toString(untreated[mC, get(ids), with=FALSE]) - # mT = k[["nn.index"]][mC] +# # mT = k[["nn.index"]][mC] - # Treatment_ID = toString(treated[mT, get(ids), with=FALSE]) +# # Treatment_ID = toString(treated[mT, get(ids), with=FALSE]) - # #Find the control x,y location - # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) +# # #Find the control x,y location +# # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # #Find the treatment x,y location - # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) +# # #Find the treatment x,y location +# # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # y_dist = abs(cCoord[1] - cCoord[2]) - # x_dist = abs(tCoord[1] - tCoord[2]) - # euc_dist = sqrt(y_dist^2 + x_dist^2) +# # y_dist = abs(cCoord[1] - cCoord[2]) +# # x_dist = abs(tCoord[1] - tCoord[2]) +# # euc_dist = sqrt(y_dist^2 + x_dist^2) - # print("nn2.2.1") +# # print("nn2.2.1") - # PSM_score = k[["nn.dist"]][mC] - # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") +# # PSM_score = k[["nn.dist"]][mC] +# # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # print("nn2.2.2") +# # print("nn2.2.2") - # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) +# # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # } +# # } - # } +# # } - print("nn2.3") +# print("nn2.3") - # Add the matched treatment and control values to the recording data frame - # best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) +# # Add the matched treatment and control values to the recording data frame +# # best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. +# best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - # This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] +# # This will give us the matched index in the "treated" dataset. +# best_m_treated = k[["nn.index"]][best_m_control] - # Control and Treatment PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) +# # Control and Treatment PSM ID +# # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") +# # Control_ID = toString(eval(parse(text=cid_txt))) - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) +# # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") +# # Treatment_ID = toString(eval(parse(text=tid_txt))) - print(class(untreated)) - print(colnames(untreated)) +# print(class(untreated)) +# print(colnames(untreated)) - # Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[best_m_control, (ids), with=FALSE]) - Treatment_ID = toString(treated[best_m_control, (ids), with=FALSE]) +# # Control PSM ID and Treatment PSM ID +# Control_ID = toString(untreated[best_m_control, (ids), with=FALSE]) +# Treatment_ID = toString(treated[best_m_control, (ids), with=FALSE]) - # Create a unique pair ID for each group (will simply append a "1" if only 1 group) - if (is.null(curgrp)) { - pair_id <- paste('pair',j, sep="") - } else { - pair_id <- paste(curgrp,j, sep="") +# # Create a unique pair ID for each group (will simply append a "1" if only 1 group) +# if (is.null(curgrp)) { +# pair_id <- paste('pair',j, sep="") +# } else { +# pair_id <- paste(curgrp,j, sep="") - } +# } - print("nn2.4x") +# print("nn2.4x") - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID +# #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row +# # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID +# # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id +# # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] +# dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] +# # Drop the paired match out of the iteration matrix +# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] +# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 +# # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - qt = quote(ids != Treatment_ID) - qu = quote(ids != Treatment_ID) +# qt = quote(ids != Treatment_ID) +# qu = quote(ids != Treatment_ID) - treated <- treated[eval(qt)] - untreated <- untreated[eval(qu)] +# treated <- treated[eval(qt)] +# untreated <- untreated[eval(qu)] - } +# } - return(dta) +# return(dta) -} +# } @@ -155,169 +155,169 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { -# #FastNN -# #Algorithm to find a hopefully near-optimal match of pairs -# #In a treatment and control group -# #Works by first ordering by the propensity score matching value, -# #and then working through this list in order from highest to lowest. -# #Matches are removed each step. +#FastNN +#Algorithm to find a hopefully near-optimal match of pairs +#In a treatment and control group +#Works by first ordering by the propensity score matching value, +#and then working through this list in order from highest to lowest. +#Matches are removed each step. -# fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { +fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { -# print("nn1.0") -# # timerx <- proc.time() + print("nn1.0") + # timerx <- proc.time() -# #Fast nearest neighbors search - will not arrive at optimum, -# #but this may not be an issue for many analysis. -# #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. -# sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] -# #Conduct the matching -# treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] -# untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + #Conduct the matching + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] -# it_cnt = min(length(treated[[1]]), length(untreated[[1]])) -# # dta@data[["match"]] <- -999 -# # dta@data[["PSM_distance"]] <- -999 -# dta@data[["PSM_match_ID"]] <- -999 + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 -# print("nn2") + # print("nn2") -# #Calculate a distance decay function -# #to perturb pairs based on their distances. -# for (j in 1:it_cnt) { -# # time_list <- c() + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + # time_list <- c() -# print("nn2.0") -# # timer <- proc.time() + print("nn2.0") + # timer <- proc.time() -# # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] -# # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] -# # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) -# # time_list[1] <- round((proc.time() - timer)[3],5) -# # print("nn2.1") -# # timer <- proc.time() + # time_list[1] <- round((proc.time() - timer)[3],5) + # print("nn2.1") + # timer <- proc.time() -# #Run the KNN for all neighbors. -# # print(length(treated[[1]])) -# # summary(treated[["PSM_trtProb"]]) -# # print(length(untreated[[1]])) -# # summary(untreated[["PSM_trtProb"]]) + #Run the KNN for all neighbors. + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) -# k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) -# # time_list[2] <- round((proc.time() - timer)[3],5) -# print("nn2.2") -# # timer <- proc.time() + # time_list[2] <- round((proc.time() - timer)[3],5) + print("nn2.2") + # timer <- proc.time() -# #Perturb the values based on the distance decay function, if selected. -# if (!is.null(dist_PSM)) { -# for (mC in 1:length(k[[1]])) { + # #Perturb the values based on the distance decay function, if selected. + # if (!is.null(dist_PSM)) { + # for (mC in 1:length(k[[1]])) { -# print("nn2.2.0") + # print("nn2.2.0") -# #Calculate the Euclidean Distance between pairs -# Control_ID = toString(untreated[[ids]][[mC]]) + # #Calculate the Euclidean Distance between pairs + # Control_ID = toString(untreated[[ids]][[mC]]) -# mT = k[["nn.index"]][mC] + # mT = k[["nn.index"]][mC] -# Treatment_ID = toString(treated[[ids]][[mT]]) + # Treatment_ID = toString(treated[[ids]][[mT]]) -# #Find the control x,y location -# cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # #Find the control x,y location + # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) -# #Find the treatment x,y location -# tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # #Find the treatment x,y location + # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) -# y_dist = abs(cCoord[1] - cCoord[2]) -# x_dist = abs(tCoord[1] - tCoord[2]) -# euc_dist = sqrt(y_dist^2 + x_dist^2) + # y_dist = abs(cCoord[1] - cCoord[2]) + # x_dist = abs(tCoord[1] - tCoord[2]) + # euc_dist = sqrt(y_dist^2 + x_dist^2) -# print("nn2.2.1") + # print("nn2.2.1") -# PSM_score = k[["nn.dist"]][mC] -# geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # PSM_score = k[["nn.dist"]][mC] + # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") -# print("nn2.2.2") + # print("nn2.2.2") -# k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) -# } + # } -# } + # } -# # time_list[3] <- round((proc.time() - timer)[3],5) -# print("nn2.3") -# # timer <- proc.time() + # time_list[3] <- round((proc.time() - timer)[3],5) + print("nn2.3") + # timer <- proc.time() -# #Add the matched treatment and control values to the recording data frame -# #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. -# best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) -# #This will give us the matched index in the "treated" dataset. -# best_m_treated = k[["nn.index"]][best_m_control] + #This will give us the matched index in the "treated" dataset. + best_m_treated = k[["nn.index"]][best_m_control] -# #Control PSM ID -# cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") -# Control_ID = toString(eval(parse(text=cid_txt))) + #Control PSM ID + cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + Control_ID = toString(eval(parse(text=cid_txt))) -# #Treatment PSM ID -# tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") -# Treatment_ID = toString(eval(parse(text=tid_txt))) + #Treatment PSM ID + tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + Treatment_ID = toString(eval(parse(text=tid_txt))) -# #Control PSM ID and Treatment PSM ID -# # Control_ID = toString(untreated[,ids][best_m_control]) -# # Treatment_ID = toString(treated[,ids][best_m_treated]) + #Control PSM ID and Treatment PSM ID + # Control_ID = toString(untreated[,ids][best_m_control]) + # Treatment_ID = toString(treated[,ids][best_m_treated]) -# #Create a unique pair ID for each group (will simply append a "1" if only 1 group) -# pair_id = paste(curgrp,j, sep="") + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j, sep="") -# # time_list[4] <- round((proc.time() - timer)[3],5) -# print("nn2.4x") -# # timer <- proc.time() + # time_list[4] <- round((proc.time() - timer)[3],5) + print("nn2.4x") + # timer <- proc.time() -# #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row -# # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID -# # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID -# # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] -# dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id -# # time_list[5] <- round((proc.time() - timer)[3],5) -# # timer <- proc.time() + # time_list[5] <- round((proc.time() - timer)[3],5) + # timer <- proc.time() -# #Drop the paired match out of the iteration matrix -# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] -# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + #Drop the paired match out of the iteration matrix + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] -# # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 -# treated <- treated[which(treated[[ids]] != Treatment_ID),] -# untreated <- untreated[which(untreated[[ids]] != Control_ID),] + treated <- treated[which(treated[[ids]] != Treatment_ID),] + untreated <- untreated[which(untreated[[ids]] != Control_ID),] -# # time_list[6] <- round((proc.time() - timer)[3],5) -# # print(paste(time_list)) + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) -# } + } -# # print((proc.time() - timerx)[3]) + # print((proc.time() - timerx)[3]) -# return(dta) + return(dta) -# } +} From fecb05258a6895fe1782ca0f22e797e9f04a367b Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 14:06:43 -0500 Subject: [PATCH 188/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index b16694d..b12f158 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -192,7 +192,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { for (j in 1:it_cnt) { # time_list <- c() - print("nn2.0") + print(paste("nn cnt:",i)) # timer <- proc.time() # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] @@ -213,7 +213,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) # time_list[2] <- round((proc.time() - timer)[3],5) - print("nn2.2") + # print("nn2.2") # timer <- proc.time() # #Perturb the values based on the distance decay function, if selected. From c3f1859ab289f089a52fe865a6194fd14dc39605 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 14:07:41 -0500 Subject: [PATCH 189/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index b12f158..b3812d6 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -192,7 +192,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { for (j in 1:it_cnt) { # time_list <- c() - print(paste("nn cnt:",i)) + print(paste("nn cnt:",j)) # timer <- proc.time() # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] From 50c67c143b2fda5bd4a28443f9115ad7e391ccaf Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 14:30:27 -0500 Subject: [PATCH 190/212] Cleaning up fastnn and sat. --- R/SAT.R | 4 ++++ R/fastNN_binary_func.R | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/R/SAT.R b/R/SAT.R index 5cdad3b..ac3f97c 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -371,6 +371,10 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # *** # this is the slow part of functions dta <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) + + if (class(dta) == class('drop')) { + return('drop') + } } diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index b3812d6..b65eb3f 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -181,6 +181,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + + if (it_cnt < 30) { + return('drop') + } + # dta@data[["match"]] <- -999 # dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 From 866f21d9ed736ac0df1628577e399d8b1a9d8710 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 16:20:32 -0500 Subject: [PATCH 191/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index b65eb3f..01cbb30 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -185,7 +185,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { if (it_cnt < 30) { return('drop') } - + # dta@data[["match"]] <- -999 # dta@data[["PSM_distance"]] <- -999 dta@data[["PSM_match_ID"]] <- -999 @@ -271,17 +271,17 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { best_m_treated = k[["nn.index"]][best_m_control] - #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) + # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[,ids][best_m_control]) - # Treatment_ID = toString(treated[,ids][best_m_treated]) + Control_ID = toString(untreated[[ids][best_m_control]) + Treatment_ID = toString(treated[[ids]][best_m_treated]) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From c1015ba3559739613d01451eeef9378b7a1af822 Mon Sep 17 00:00:00 2001 From: userz Date: Tue, 10 Nov 2015 16:21:03 -0500 Subject: [PATCH 192/212] Cleaning up fastnn and sat. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 01cbb30..0f80e36 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -280,7 +280,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # Treatment_ID = toString(eval(parse(text=tid_txt))) #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[[ids][best_m_control]) + Control_ID = toString(untreated[[ids]][best_m_control]) Treatment_ID = toString(treated[[ids]][best_m_treated]) From a0f9eca05043c2a4a8d032e4751a4dc9c1a41382 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:12:28 -0500 Subject: [PATCH 193/212] Test matchit lib. --- R/fastNN_binary_func.R | 222 +++++++++++++++++++++++------------------ R/loadLibs.R | 4 +- 2 files changed, 126 insertions(+), 100 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 0f80e36..50b83d5 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -164,163 +164,187 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - print("nn1.0") - # timerx <- proc.time() - + print('nn') - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + # make sure there are at least x rows in both treated and control + row_min <- 30 + trt_rows <- dta@data$TrtBin == 1 + if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + return('drop') + } - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + rownames(dta@data) <- dta$sector_split_id + m <- matchit(TrtBin ~ PSM_trtProb, data=dta@data, method="optimal", ratio=1) - #Conduct the matching - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + for ( i in rownames(m$match.matrix) ) { + trt_id <- i + cnt_id <- m$match.matrix[i,] + pair_id <- paste(trt_id,cnt_id, sep='__') - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - if (it_cnt < 30) { - return('drop') } - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 - # print("nn2") + # =========================================================================== + + + # print("nn1.0") + + + # #Fast nearest neighbors search - will not arrive at optimum, + # #but this may not be an issue for many analysis. + # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + + # sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) { - # time_list <- c() - print(paste("nn cnt:",j)) - # timer <- proc.time() + # #Conduct the matching + # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + + # if (it_cnt < 30) { + # return('drop') + # } + + # # dta@data[["match"]] <- -999 + # # dta@data[["PSM_distance"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 + + # # print("nn2") + + # #Calculate a distance decay function + # #to perturb pairs based on their distances. + # for (j in 1:it_cnt) { + # # time_list <- c() + + # print(paste("nn cnt:",j)) + # # timer <- proc.time() + + # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() + # # time_list[1] <- round((proc.time() - timer)[3],5) + # # print("nn2.1") + # # timer <- proc.time() - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) + # #Run the KNN for all neighbors. + # # print(length(treated[[1]])) + # # summary(treated[["PSM_trtProb"]]) + # # print(length(untreated[[1]])) + # # summary(untreated[["PSM_trtProb"]]) - k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # time_list[2] <- round((proc.time() - timer)[3],5) - # print("nn2.2") - # timer <- proc.time() + # # time_list[2] <- round((proc.time() - timer)[3],5) + # # print("nn2.2") + # # timer <- proc.time() - # #Perturb the values based on the distance decay function, if selected. - # if (!is.null(dist_PSM)) { - # for (mC in 1:length(k[[1]])) { + # # #Perturb the values based on the distance decay function, if selected. + # # if (!is.null(dist_PSM)) { + # # for (mC in 1:length(k[[1]])) { - # print("nn2.2.0") + # # print("nn2.2.0") - # #Calculate the Euclidean Distance between pairs - # Control_ID = toString(untreated[[ids]][[mC]]) + # # #Calculate the Euclidean Distance between pairs + # # Control_ID = toString(untreated[[ids]][[mC]]) - # mT = k[["nn.index"]][mC] + # # mT = k[["nn.index"]][mC] - # Treatment_ID = toString(treated[[ids]][[mT]]) + # # Treatment_ID = toString(treated[[ids]][[mT]]) - # #Find the control x,y location - # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # # #Find the control x,y location + # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # #Find the treatment x,y location - # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # # #Find the treatment x,y location + # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # y_dist = abs(cCoord[1] - cCoord[2]) - # x_dist = abs(tCoord[1] - tCoord[2]) - # euc_dist = sqrt(y_dist^2 + x_dist^2) + # # y_dist = abs(cCoord[1] - cCoord[2]) + # # x_dist = abs(tCoord[1] - tCoord[2]) + # # euc_dist = sqrt(y_dist^2 + x_dist^2) - # print("nn2.2.1") + # # print("nn2.2.1") - # PSM_score = k[["nn.dist"]][mC] - # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # # PSM_score = k[["nn.dist"]][mC] + # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # print("nn2.2.2") + # # print("nn2.2.2") - # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # } + # # } - # } + # # } - # time_list[3] <- round((proc.time() - timer)[3],5) - print("nn2.3") - # timer <- proc.time() + # # time_list[3] <- round((proc.time() - timer)[3],5) + # print("nn2.3") + # # timer <- proc.time() - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + # #Add the matched treatment and control values to the recording data frame + # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - #This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] + # #This will give us the matched index in the "treated" dataset. + # best_m_treated = k[["nn.index"]][best_m_control] - # #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) + # # #Control PSM ID + # # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # # Control_ID = toString(eval(parse(text=cid_txt))) - # #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) + # # #Treatment PSM ID + # # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # # Treatment_ID = toString(eval(parse(text=tid_txt))) - #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[[ids]][best_m_control]) - Treatment_ID = toString(treated[[ids]][best_m_treated]) + # #Control PSM ID and Treatment PSM ID + # Control_ID = toString(untreated[[ids]][best_m_control]) + # Treatment_ID = toString(treated[[ids]][best_m_treated]) - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") + # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + # pair_id = paste(curgrp,j, sep="") - # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4x") - # timer <- proc.time() + # # time_list[4] <- round((proc.time() - timer)[3],5) + # print("nn2.4x") + # # timer <- proc.time() - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() + # # time_list[5] <- round((proc.time() - timer)[3],5) + # # timer <- proc.time() - #Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + # #Drop the paired match out of the iteration matrix + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[which(treated[[ids]] != Treatment_ID),] - untreated <- untreated[which(untreated[[ids]] != Control_ID),] + # treated <- treated[which(treated[[ids]] != Treatment_ID),] + # untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) + # # time_list[6] <- round((proc.time() - timer)[3],5) + # # print(paste(time_list)) - } + # } - # print((proc.time() - timerx)[3]) return(dta) diff --git a/R/loadLibs.R b/R/loadLibs.R index cff4216..943400e 100644 --- a/R/loadLibs.R +++ b/R/loadLibs.R @@ -20,5 +20,7 @@ loadLibs <- function (x=1) { library(lmtest) library(multiwayvcov) - library(data.table) + # library(data.table) + library(MatchIt) + library(optmatch) } From f6540526621ead71cb29db10b2220c9c46f5765c Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:22:32 -0500 Subject: [PATCH 194/212] Test matchit lib. --- R/fastNN_binary_func.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 50b83d5..f92f74d 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -174,9 +174,15 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { return('drop') } - rownames(dta@data) <- dta$sector_split_id - m <- matchit(TrtBin ~ PSM_trtProb, data=dta@data, method="optimal", ratio=1) + + match_data <- dta@data[,c('PSM_trtProb', ids)] + + rownames(match_data) <- match_data[[ids]] + + zzz <<- match_data + + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="optimal", ratio=1) for ( i in rownames(m$match.matrix) ) { trt_id <- i From 54c3d72d1159a9c7729208db512859c712ef291e Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:23:45 -0500 Subject: [PATCH 195/212] Test matchit lib. --- R/fastNN_binary_func.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f92f74d..3a4adcb 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -176,12 +176,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - match_data <- dta@data[,c('PSM_trtProb', ids)] + match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] rownames(match_data) <- match_data[[ids]] zzz <<- match_data - + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="optimal", ratio=1) for ( i in rownames(m$match.matrix) ) { From 80759e750363be6be2025a1127d01f1a4fbefa35 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:38:21 -0500 Subject: [PATCH 196/212] Test matchit lib. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 3a4adcb..e563d85 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -182,7 +182,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { zzz <<- match_data - m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="optimal", ratio=1) + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) for ( i in rownames(m$match.matrix) ) { trt_id <- i From 85b06fe2957f0843f90cfda5ef642d5ce56bd5fb Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:40:54 -0500 Subject: [PATCH 197/212] Test matchit lib. --- R/fastNN_binary_func.R | 2 ++ R/loadLibs.R | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index e563d85..a9b3aaa 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -184,6 +184,8 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + dta@data[["PSM_match_ID"]] <- -999 + for ( i in rownames(m$match.matrix) ) { trt_id <- i cnt_id <- m$match.matrix[i,] diff --git a/R/loadLibs.R b/R/loadLibs.R index 943400e..6facdfc 100644 --- a/R/loadLibs.R +++ b/R/loadLibs.R @@ -22,5 +22,5 @@ loadLibs <- function (x=1) { # library(data.table) library(MatchIt) - library(optmatch) + # library(optmatch) } From f72ebf2481830df699643c3fda0222860f637960 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:48:34 -0500 Subject: [PATCH 198/212] Test matchit lib. --- R/fastNN_binary_func.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index a9b3aaa..4b7f738 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -189,10 +189,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { for ( i in rownames(m$match.matrix) ) { trt_id <- i cnt_id <- m$match.matrix[i,] - pair_id <- paste(trt_id,cnt_id, sep='__') - - dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - + + if ( !is.na(cnt_id)) ) { + pair_id <- paste(trt_id,cnt_id, sep='__') + dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + } } From 4b668441a72e3a2191f4fe63c19f09a64a9608aa Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:48:58 -0500 Subject: [PATCH 199/212] Test matchit lib. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 4b7f738..cebf804 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -190,7 +190,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { trt_id <- i cnt_id <- m$match.matrix[i,] - if ( !is.na(cnt_id)) ) { + if ( !is.na(cnt_id) ) { pair_id <- paste(trt_id,cnt_id, sep='__') dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id } From 4af586e8cb0c36b3449644e422ea7c49ac03f738 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:55:54 -0500 Subject: [PATCH 200/212] Test matchit lib. --- R/fastNN_binary_func.R | 234 ++++++++++++++++++++--------------------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index cebf804..5592541 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -165,194 +165,194 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - print('nn') + # print('nn') - # make sure there are at least x rows in both treated and control - row_min <- 30 - trt_rows <- dta@data$TrtBin == 1 - if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { - return('drop') - } + # # make sure there are at least x rows in both treated and control + # row_min <- 30 + # trt_rows <- dta@data$TrtBin == 1 + # if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + # return('drop') + # } - match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + # match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] - rownames(match_data) <- match_data[[ids]] + # rownames(match_data) <- match_data[[ids]] - zzz <<- match_data + # zzz <<- match_data - m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + # m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) - dta@data[["PSM_match_ID"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 - for ( i in rownames(m$match.matrix) ) { - trt_id <- i - cnt_id <- m$match.matrix[i,] + # for ( i in rownames(m$match.matrix) ) { + # trt_id <- i + # cnt_id <- m$match.matrix[i,] - if ( !is.na(cnt_id) ) { - pair_id <- paste(trt_id,cnt_id, sep='__') - dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - } - } + # if ( !is.na(cnt_id) ) { + # pair_id <- paste(trt_id,cnt_id, sep='__') + # dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + # } + # } # =========================================================================== - # print("nn1.0") + print("nn1.0") - # #Fast nearest neighbors search - will not arrive at optimum, - # #but this may not be an issue for many analysis. - # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - # sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - # #Conduct the matching - # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + #Conduct the matching + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - # if (it_cnt < 30) { - # return('drop') - # } + if (it_cnt < 30) { + return('drop') + } - # # dta@data[["match"]] <- -999 - # # dta@data[["PSM_distance"]] <- -999 - # dta@data[["PSM_match_ID"]] <- -999 + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 - # # print("nn2") + # print("nn2") - # #Calculate a distance decay function - # #to perturb pairs based on their distances. - # for (j in 1:it_cnt) { - # # time_list <- c() + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + # time_list <- c() - # print(paste("nn cnt:",j)) - # # timer <- proc.time() + print(paste("nn cnt:",j)) + # timer <- proc.time() - # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # # time_list[1] <- round((proc.time() - timer)[3],5) - # # print("nn2.1") - # # timer <- proc.time() + # time_list[1] <- round((proc.time() - timer)[3],5) + # print("nn2.1") + # timer <- proc.time() - # #Run the KNN for all neighbors. - # # print(length(treated[[1]])) - # # summary(treated[["PSM_trtProb"]]) - # # print(length(untreated[[1]])) - # # summary(untreated[["PSM_trtProb"]]) + #Run the KNN for all neighbors. + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) - # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # # time_list[2] <- round((proc.time() - timer)[3],5) - # # print("nn2.2") - # # timer <- proc.time() + # time_list[2] <- round((proc.time() - timer)[3],5) + # print("nn2.2") + # timer <- proc.time() - # # #Perturb the values based on the distance decay function, if selected. - # # if (!is.null(dist_PSM)) { - # # for (mC in 1:length(k[[1]])) { + # #Perturb the values based on the distance decay function, if selected. + # if (!is.null(dist_PSM)) { + # for (mC in 1:length(k[[1]])) { - # # print("nn2.2.0") + # print("nn2.2.0") - # # #Calculate the Euclidean Distance between pairs - # # Control_ID = toString(untreated[[ids]][[mC]]) + # #Calculate the Euclidean Distance between pairs + # Control_ID = toString(untreated[[ids]][[mC]]) - # # mT = k[["nn.index"]][mC] + # mT = k[["nn.index"]][mC] - # # Treatment_ID = toString(treated[[ids]][[mT]]) + # Treatment_ID = toString(treated[[ids]][[mT]]) - # # #Find the control x,y location - # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # #Find the control x,y location + # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # # #Find the treatment x,y location - # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # #Find the treatment x,y location + # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # # y_dist = abs(cCoord[1] - cCoord[2]) - # # x_dist = abs(tCoord[1] - tCoord[2]) - # # euc_dist = sqrt(y_dist^2 + x_dist^2) + # y_dist = abs(cCoord[1] - cCoord[2]) + # x_dist = abs(tCoord[1] - tCoord[2]) + # euc_dist = sqrt(y_dist^2 + x_dist^2) - # # print("nn2.2.1") + # print("nn2.2.1") - # # PSM_score = k[["nn.dist"]][mC] - # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # PSM_score = k[["nn.dist"]][mC] + # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # # print("nn2.2.2") + # print("nn2.2.2") - # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # # } + # } - # # } + # } - # # time_list[3] <- round((proc.time() - timer)[3],5) - # print("nn2.3") - # # timer <- proc.time() + # time_list[3] <- round((proc.time() - timer)[3],5) + print("nn2.3") + # timer <- proc.time() - # #Add the matched treatment and control values to the recording data frame - # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - # #This will give us the matched index in the "treated" dataset. - # best_m_treated = k[["nn.index"]][best_m_control] + #This will give us the matched index in the "treated" dataset. + best_m_treated = k[["nn.index"]][best_m_control] - # # #Control PSM ID - # # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # # Control_ID = toString(eval(parse(text=cid_txt))) + # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - # # #Treatment PSM ID - # # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # # Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) - # #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[[ids]][best_m_control]) - # Treatment_ID = toString(treated[[ids]][best_m_treated]) + #Control PSM ID and Treatment PSM ID + Control_ID = toString(untreated[[ids]][best_m_control]) + Treatment_ID = toString(treated[[ids]][best_m_treated]) - # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - # pair_id = paste(curgrp,j, sep="") + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j, sep="") - # # time_list[4] <- round((proc.time() - timer)[3],5) - # print("nn2.4x") - # # timer <- proc.time() + # time_list[4] <- round((proc.time() - timer)[3],5) + print("nn2.4x") + # timer <- proc.time() - # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # # time_list[5] <- round((proc.time() - timer)[3],5) - # # timer <- proc.time() + # time_list[5] <- round((proc.time() - timer)[3],5) + # timer <- proc.time() - # #Drop the paired match out of the iteration matrix - # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + #Drop the paired match out of the iteration matrix + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - # treated <- treated[which(treated[[ids]] != Treatment_ID),] - # untreated <- untreated[which(untreated[[ids]] != Control_ID),] + treated <- treated[which(treated[[ids]] != Treatment_ID),] + untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # # time_list[6] <- round((proc.time() - timer)[3],5) - # # print(paste(time_list)) + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) - # } + } return(dta) From 32cce26632157766f8a2e09af71b72a6a2a7f195 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 15:57:21 -0500 Subject: [PATCH 201/212] Test matchit lib. --- R/fastNN_binary_func.R | 234 ++++++++++++++++++++--------------------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 5592541..cebf804 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -165,194 +165,194 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - # print('nn') + print('nn') - # # make sure there are at least x rows in both treated and control - # row_min <- 30 - # trt_rows <- dta@data$TrtBin == 1 - # if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { - # return('drop') - # } + # make sure there are at least x rows in both treated and control + row_min <- 30 + trt_rows <- dta@data$TrtBin == 1 + if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + return('drop') + } - # match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] - # rownames(match_data) <- match_data[[ids]] + rownames(match_data) <- match_data[[ids]] - # zzz <<- match_data + zzz <<- match_data - # m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) - # dta@data[["PSM_match_ID"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 - # for ( i in rownames(m$match.matrix) ) { - # trt_id <- i - # cnt_id <- m$match.matrix[i,] + for ( i in rownames(m$match.matrix) ) { + trt_id <- i + cnt_id <- m$match.matrix[i,] - # if ( !is.na(cnt_id) ) { - # pair_id <- paste(trt_id,cnt_id, sep='__') - # dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - # } - # } + if ( !is.na(cnt_id) ) { + pair_id <- paste(trt_id,cnt_id, sep='__') + dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + } + } # =========================================================================== - print("nn1.0") + # print("nn1.0") - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + # #Fast nearest neighbors search - will not arrive at optimum, + # #but this may not be an issue for many analysis. + # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + # sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - #Conduct the matching - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + # #Conduct the matching + # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - if (it_cnt < 30) { - return('drop') - } + # if (it_cnt < 30) { + # return('drop') + # } - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 + # # dta@data[["match"]] <- -999 + # # dta@data[["PSM_distance"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 - # print("nn2") + # # print("nn2") - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) { - # time_list <- c() + # #Calculate a distance decay function + # #to perturb pairs based on their distances. + # for (j in 1:it_cnt) { + # # time_list <- c() - print(paste("nn cnt:",j)) - # timer <- proc.time() + # print(paste("nn cnt:",j)) + # # timer <- proc.time() - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() + # # time_list[1] <- round((proc.time() - timer)[3],5) + # # print("nn2.1") + # # timer <- proc.time() - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) + # #Run the KNN for all neighbors. + # # print(length(treated[[1]])) + # # summary(treated[["PSM_trtProb"]]) + # # print(length(untreated[[1]])) + # # summary(untreated[["PSM_trtProb"]]) - k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # time_list[2] <- round((proc.time() - timer)[3],5) - # print("nn2.2") - # timer <- proc.time() + # # time_list[2] <- round((proc.time() - timer)[3],5) + # # print("nn2.2") + # # timer <- proc.time() - # #Perturb the values based on the distance decay function, if selected. - # if (!is.null(dist_PSM)) { - # for (mC in 1:length(k[[1]])) { + # # #Perturb the values based on the distance decay function, if selected. + # # if (!is.null(dist_PSM)) { + # # for (mC in 1:length(k[[1]])) { - # print("nn2.2.0") + # # print("nn2.2.0") - # #Calculate the Euclidean Distance between pairs - # Control_ID = toString(untreated[[ids]][[mC]]) + # # #Calculate the Euclidean Distance between pairs + # # Control_ID = toString(untreated[[ids]][[mC]]) - # mT = k[["nn.index"]][mC] + # # mT = k[["nn.index"]][mC] - # Treatment_ID = toString(treated[[ids]][[mT]]) + # # Treatment_ID = toString(treated[[ids]][[mT]]) - # #Find the control x,y location - # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # # #Find the control x,y location + # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # #Find the treatment x,y location - # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # # #Find the treatment x,y location + # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # y_dist = abs(cCoord[1] - cCoord[2]) - # x_dist = abs(tCoord[1] - tCoord[2]) - # euc_dist = sqrt(y_dist^2 + x_dist^2) + # # y_dist = abs(cCoord[1] - cCoord[2]) + # # x_dist = abs(tCoord[1] - tCoord[2]) + # # euc_dist = sqrt(y_dist^2 + x_dist^2) - # print("nn2.2.1") + # # print("nn2.2.1") - # PSM_score = k[["nn.dist"]][mC] - # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # # PSM_score = k[["nn.dist"]][mC] + # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # print("nn2.2.2") + # # print("nn2.2.2") - # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # } + # # } - # } + # # } - # time_list[3] <- round((proc.time() - timer)[3],5) - print("nn2.3") - # timer <- proc.time() + # # time_list[3] <- round((proc.time() - timer)[3],5) + # print("nn2.3") + # # timer <- proc.time() - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + # #Add the matched treatment and control values to the recording data frame + # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - #This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] + # #This will give us the matched index in the "treated" dataset. + # best_m_treated = k[["nn.index"]][best_m_control] - # #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) + # # #Control PSM ID + # # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # # Control_ID = toString(eval(parse(text=cid_txt))) - # #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) + # # #Treatment PSM ID + # # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # # Treatment_ID = toString(eval(parse(text=tid_txt))) - #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[[ids]][best_m_control]) - Treatment_ID = toString(treated[[ids]][best_m_treated]) + # #Control PSM ID and Treatment PSM ID + # Control_ID = toString(untreated[[ids]][best_m_control]) + # Treatment_ID = toString(treated[[ids]][best_m_treated]) - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") + # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + # pair_id = paste(curgrp,j, sep="") - # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4x") - # timer <- proc.time() + # # time_list[4] <- round((proc.time() - timer)[3],5) + # print("nn2.4x") + # # timer <- proc.time() - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() + # # time_list[5] <- round((proc.time() - timer)[3],5) + # # timer <- proc.time() - #Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + # #Drop the paired match out of the iteration matrix + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[which(treated[[ids]] != Treatment_ID),] - untreated <- untreated[which(untreated[[ids]] != Control_ID),] + # treated <- treated[which(treated[[ids]] != Treatment_ID),] + # untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) + # # time_list[6] <- round((proc.time() - timer)[3],5) + # # print(paste(time_list)) - } + # } return(dta) From c37946e0ced0b4d000e3a38ef54893af7862ce55 Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 16:16:33 -0500 Subject: [PATCH 202/212] Test matchit lib. --- R/fastNN_binary_func.R | 234 ++++++++++++++++++++--------------------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index cebf804..5592541 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -165,194 +165,194 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - print('nn') + # print('nn') - # make sure there are at least x rows in both treated and control - row_min <- 30 - trt_rows <- dta@data$TrtBin == 1 - if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { - return('drop') - } + # # make sure there are at least x rows in both treated and control + # row_min <- 30 + # trt_rows <- dta@data$TrtBin == 1 + # if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + # return('drop') + # } - match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + # match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] - rownames(match_data) <- match_data[[ids]] + # rownames(match_data) <- match_data[[ids]] - zzz <<- match_data + # zzz <<- match_data - m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + # m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) - dta@data[["PSM_match_ID"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 - for ( i in rownames(m$match.matrix) ) { - trt_id <- i - cnt_id <- m$match.matrix[i,] + # for ( i in rownames(m$match.matrix) ) { + # trt_id <- i + # cnt_id <- m$match.matrix[i,] - if ( !is.na(cnt_id) ) { - pair_id <- paste(trt_id,cnt_id, sep='__') - dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - } - } + # if ( !is.na(cnt_id) ) { + # pair_id <- paste(trt_id,cnt_id, sep='__') + # dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + # } + # } # =========================================================================== - # print("nn1.0") + print("nn1.0") - # #Fast nearest neighbors search - will not arrive at optimum, - # #but this may not be an issue for many analysis. - # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - # sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - # #Conduct the matching - # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + #Conduct the matching + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - # if (it_cnt < 30) { - # return('drop') - # } + if (it_cnt < 30) { + return('drop') + } - # # dta@data[["match"]] <- -999 - # # dta@data[["PSM_distance"]] <- -999 - # dta@data[["PSM_match_ID"]] <- -999 + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 - # # print("nn2") + # print("nn2") - # #Calculate a distance decay function - # #to perturb pairs based on their distances. - # for (j in 1:it_cnt) { - # # time_list <- c() + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + # time_list <- c() - # print(paste("nn cnt:",j)) - # # timer <- proc.time() + print(paste("nn cnt:",j)) + # timer <- proc.time() - # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # # time_list[1] <- round((proc.time() - timer)[3],5) - # # print("nn2.1") - # # timer <- proc.time() + # time_list[1] <- round((proc.time() - timer)[3],5) + # print("nn2.1") + # timer <- proc.time() - # #Run the KNN for all neighbors. - # # print(length(treated[[1]])) - # # summary(treated[["PSM_trtProb"]]) - # # print(length(untreated[[1]])) - # # summary(untreated[["PSM_trtProb"]]) + #Run the KNN for all neighbors. + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) - # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # # time_list[2] <- round((proc.time() - timer)[3],5) - # # print("nn2.2") - # # timer <- proc.time() + # time_list[2] <- round((proc.time() - timer)[3],5) + # print("nn2.2") + # timer <- proc.time() - # # #Perturb the values based on the distance decay function, if selected. - # # if (!is.null(dist_PSM)) { - # # for (mC in 1:length(k[[1]])) { + # #Perturb the values based on the distance decay function, if selected. + # if (!is.null(dist_PSM)) { + # for (mC in 1:length(k[[1]])) { - # # print("nn2.2.0") + # print("nn2.2.0") - # # #Calculate the Euclidean Distance between pairs - # # Control_ID = toString(untreated[[ids]][[mC]]) + # #Calculate the Euclidean Distance between pairs + # Control_ID = toString(untreated[[ids]][[mC]]) - # # mT = k[["nn.index"]][mC] + # mT = k[["nn.index"]][mC] - # # Treatment_ID = toString(treated[[ids]][[mT]]) + # Treatment_ID = toString(treated[[ids]][[mT]]) - # # #Find the control x,y location - # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # #Find the control x,y location + # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # # #Find the treatment x,y location - # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # #Find the treatment x,y location + # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # # y_dist = abs(cCoord[1] - cCoord[2]) - # # x_dist = abs(tCoord[1] - tCoord[2]) - # # euc_dist = sqrt(y_dist^2 + x_dist^2) + # y_dist = abs(cCoord[1] - cCoord[2]) + # x_dist = abs(tCoord[1] - tCoord[2]) + # euc_dist = sqrt(y_dist^2 + x_dist^2) - # # print("nn2.2.1") + # print("nn2.2.1") - # # PSM_score = k[["nn.dist"]][mC] - # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # PSM_score = k[["nn.dist"]][mC] + # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # # print("nn2.2.2") + # print("nn2.2.2") - # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # # } + # } - # # } + # } - # # time_list[3] <- round((proc.time() - timer)[3],5) - # print("nn2.3") - # # timer <- proc.time() + # time_list[3] <- round((proc.time() - timer)[3],5) + print("nn2.3") + # timer <- proc.time() - # #Add the matched treatment and control values to the recording data frame - # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - # #This will give us the matched index in the "treated" dataset. - # best_m_treated = k[["nn.index"]][best_m_control] + #This will give us the matched index in the "treated" dataset. + best_m_treated = k[["nn.index"]][best_m_control] - # # #Control PSM ID - # # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # # Control_ID = toString(eval(parse(text=cid_txt))) + # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - # # #Treatment PSM ID - # # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # # Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) - # #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[[ids]][best_m_control]) - # Treatment_ID = toString(treated[[ids]][best_m_treated]) + #Control PSM ID and Treatment PSM ID + Control_ID = toString(untreated[[ids]][best_m_control]) + Treatment_ID = toString(treated[[ids]][best_m_treated]) - # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - # pair_id = paste(curgrp,j, sep="") + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j, sep="") - # # time_list[4] <- round((proc.time() - timer)[3],5) - # print("nn2.4x") - # # timer <- proc.time() + # time_list[4] <- round((proc.time() - timer)[3],5) + print("nn2.4x") + # timer <- proc.time() - # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # # time_list[5] <- round((proc.time() - timer)[3],5) - # # timer <- proc.time() + # time_list[5] <- round((proc.time() - timer)[3],5) + # timer <- proc.time() - # #Drop the paired match out of the iteration matrix - # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + #Drop the paired match out of the iteration matrix + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - # treated <- treated[which(treated[[ids]] != Treatment_ID),] - # untreated <- untreated[which(untreated[[ids]] != Control_ID),] + treated <- treated[which(treated[[ids]] != Treatment_ID),] + untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # # time_list[6] <- round((proc.time() - timer)[3],5) - # # print(paste(time_list)) + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) - # } + } return(dta) From b0d2bc7063f3b4eaba502962eef77be1b928438e Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 16:28:53 -0500 Subject: [PATCH 203/212] Test matchit lib. --- R/fastNN_binary_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 5592541..f1ea1c6 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -207,7 +207,7 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { #but this may not be an issue for many analysis. #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] #Conduct the matching From d266959be9e7d885b6a6dc96ceb901b6e30c3dbc Mon Sep 17 00:00:00 2001 From: userz Date: Thu, 12 Nov 2015 16:47:51 -0500 Subject: [PATCH 204/212] Test matchit lib. --- R/fastNN_binary_func.R | 234 ++++++++++++++++++++--------------------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f1ea1c6..efe84f1 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -165,194 +165,194 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - # print('nn') + print('nn') - # # make sure there are at least x rows in both treated and control - # row_min <- 30 - # trt_rows <- dta@data$TrtBin == 1 - # if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { - # return('drop') - # } + # make sure there are at least x rows in both treated and control + row_min <- 30 + trt_rows <- dta@data$TrtBin == 1 + if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + return('drop') + } - # match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] - # rownames(match_data) <- match_data[[ids]] + rownames(match_data) <- match_data[[ids]] - # zzz <<- match_data + zzz <<- match_data - # m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) - # dta@data[["PSM_match_ID"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 - # for ( i in rownames(m$match.matrix) ) { - # trt_id <- i - # cnt_id <- m$match.matrix[i,] + for ( i in rownames(m$match.matrix) ) { + trt_id <- i + cnt_id <- m$match.matrix[i,] - # if ( !is.na(cnt_id) ) { - # pair_id <- paste(trt_id,cnt_id, sep='__') - # dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - # } - # } + if ( !is.na(cnt_id) ) { + pair_id <- paste(trt_id,cnt_id, sep='__') + dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + } + } # =========================================================================== - print("nn1.0") + # print("nn1.0") - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + # #Fast nearest neighbors search - will not arrive at optimum, + # #but this may not be an issue for many analysis. + # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + # sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - #Conduct the matching - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + # #Conduct the matching + # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - if (it_cnt < 30) { - return('drop') - } + # if (it_cnt < 30) { + # return('drop') + # } - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 + # # dta@data[["match"]] <- -999 + # # dta@data[["PSM_distance"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 - # print("nn2") + # # print("nn2") - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) { - # time_list <- c() + # #Calculate a distance decay function + # #to perturb pairs based on their distances. + # for (j in 1:it_cnt) { + # # time_list <- c() - print(paste("nn cnt:",j)) - # timer <- proc.time() + # print(paste("nn cnt:",j)) + # # timer <- proc.time() - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() + # # time_list[1] <- round((proc.time() - timer)[3],5) + # # print("nn2.1") + # # timer <- proc.time() - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) + # #Run the KNN for all neighbors. + # # print(length(treated[[1]])) + # # summary(treated[["PSM_trtProb"]]) + # # print(length(untreated[[1]])) + # # summary(untreated[["PSM_trtProb"]]) - k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # time_list[2] <- round((proc.time() - timer)[3],5) - # print("nn2.2") - # timer <- proc.time() + # # time_list[2] <- round((proc.time() - timer)[3],5) + # # print("nn2.2") + # # timer <- proc.time() - # #Perturb the values based on the distance decay function, if selected. - # if (!is.null(dist_PSM)) { - # for (mC in 1:length(k[[1]])) { + # # #Perturb the values based on the distance decay function, if selected. + # # if (!is.null(dist_PSM)) { + # # for (mC in 1:length(k[[1]])) { - # print("nn2.2.0") + # # print("nn2.2.0") - # #Calculate the Euclidean Distance between pairs - # Control_ID = toString(untreated[[ids]][[mC]]) + # # #Calculate the Euclidean Distance between pairs + # # Control_ID = toString(untreated[[ids]][[mC]]) - # mT = k[["nn.index"]][mC] + # # mT = k[["nn.index"]][mC] - # Treatment_ID = toString(treated[[ids]][[mT]]) + # # Treatment_ID = toString(treated[[ids]][[mT]]) - # #Find the control x,y location - # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # # #Find the control x,y location + # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # #Find the treatment x,y location - # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # # #Find the treatment x,y location + # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # y_dist = abs(cCoord[1] - cCoord[2]) - # x_dist = abs(tCoord[1] - tCoord[2]) - # euc_dist = sqrt(y_dist^2 + x_dist^2) + # # y_dist = abs(cCoord[1] - cCoord[2]) + # # x_dist = abs(tCoord[1] - tCoord[2]) + # # euc_dist = sqrt(y_dist^2 + x_dist^2) - # print("nn2.2.1") + # # print("nn2.2.1") - # PSM_score = k[["nn.dist"]][mC] - # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # # PSM_score = k[["nn.dist"]][mC] + # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # print("nn2.2.2") + # # print("nn2.2.2") - # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # } + # # } - # } + # # } - # time_list[3] <- round((proc.time() - timer)[3],5) - print("nn2.3") - # timer <- proc.time() + # # time_list[3] <- round((proc.time() - timer)[3],5) + # print("nn2.3") + # # timer <- proc.time() - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + # #Add the matched treatment and control values to the recording data frame + # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - #This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] + # #This will give us the matched index in the "treated" dataset. + # best_m_treated = k[["nn.index"]][best_m_control] - # #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) + # # #Control PSM ID + # # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # # Control_ID = toString(eval(parse(text=cid_txt))) - # #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) + # # #Treatment PSM ID + # # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # # Treatment_ID = toString(eval(parse(text=tid_txt))) - #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[[ids]][best_m_control]) - Treatment_ID = toString(treated[[ids]][best_m_treated]) + # #Control PSM ID and Treatment PSM ID + # Control_ID = toString(untreated[[ids]][best_m_control]) + # Treatment_ID = toString(treated[[ids]][best_m_treated]) - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") + # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + # pair_id = paste(curgrp,j, sep="") - # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4x") - # timer <- proc.time() + # # time_list[4] <- round((proc.time() - timer)[3],5) + # print("nn2.4x") + # # timer <- proc.time() - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() + # # time_list[5] <- round((proc.time() - timer)[3],5) + # # timer <- proc.time() - #Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + # #Drop the paired match out of the iteration matrix + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[which(treated[[ids]] != Treatment_ID),] - untreated <- untreated[which(untreated[[ids]] != Control_ID),] + # treated <- treated[which(treated[[ids]] != Treatment_ID),] + # untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) + # # time_list[6] <- round((proc.time() - timer)[3],5) + # # print(paste(time_list)) - } + # } return(dta) From ede4adc702310bad6ee9829ce1601b760a5a8655 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 13 Nov 2015 09:06:05 -0500 Subject: [PATCH 205/212] Test matchit lib. --- R/SAT.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index ac3f97c..8f9812b 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -386,7 +386,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } - + vvv <<- dta print("sat5") From a38e71f714d536687b18ae3d98827e89428f73ae Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 13 Nov 2015 09:10:19 -0500 Subject: [PATCH 206/212] Test matchit lib. --- R/fastNN_binary_func.R | 234 ++++++++++++++++++++--------------------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index efe84f1..f1ea1c6 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -165,194 +165,194 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - print('nn') + # print('nn') - # make sure there are at least x rows in both treated and control - row_min <- 30 - trt_rows <- dta@data$TrtBin == 1 - if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { - return('drop') - } + # # make sure there are at least x rows in both treated and control + # row_min <- 30 + # trt_rows <- dta@data$TrtBin == 1 + # if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + # return('drop') + # } - match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + # match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] - rownames(match_data) <- match_data[[ids]] + # rownames(match_data) <- match_data[[ids]] - zzz <<- match_data + # zzz <<- match_data - m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + # m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) - dta@data[["PSM_match_ID"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 - for ( i in rownames(m$match.matrix) ) { - trt_id <- i - cnt_id <- m$match.matrix[i,] + # for ( i in rownames(m$match.matrix) ) { + # trt_id <- i + # cnt_id <- m$match.matrix[i,] - if ( !is.na(cnt_id) ) { - pair_id <- paste(trt_id,cnt_id, sep='__') - dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - } - } + # if ( !is.na(cnt_id) ) { + # pair_id <- paste(trt_id,cnt_id, sep='__') + # dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + # } + # } # =========================================================================== - # print("nn1.0") + print("nn1.0") - # #Fast nearest neighbors search - will not arrive at optimum, - # #but this may not be an issue for many analysis. - # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + #Fast nearest neighbors search - will not arrive at optimum, + #but this may not be an issue for many analysis. + #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - # sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - # #Conduct the matching - # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + #Conduct the matching + treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - # if (it_cnt < 30) { - # return('drop') - # } + if (it_cnt < 30) { + return('drop') + } - # # dta@data[["match"]] <- -999 - # # dta@data[["PSM_distance"]] <- -999 - # dta@data[["PSM_match_ID"]] <- -999 + # dta@data[["match"]] <- -999 + # dta@data[["PSM_distance"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 - # # print("nn2") + # print("nn2") - # #Calculate a distance decay function - # #to perturb pairs based on their distances. - # for (j in 1:it_cnt) { - # # time_list <- c() + #Calculate a distance decay function + #to perturb pairs based on their distances. + for (j in 1:it_cnt) { + # time_list <- c() - # print(paste("nn cnt:",j)) - # # timer <- proc.time() + print(paste("nn cnt:",j)) + # timer <- proc.time() - # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # # time_list[1] <- round((proc.time() - timer)[3],5) - # # print("nn2.1") - # # timer <- proc.time() + # time_list[1] <- round((proc.time() - timer)[3],5) + # print("nn2.1") + # timer <- proc.time() - # #Run the KNN for all neighbors. - # # print(length(treated[[1]])) - # # summary(treated[["PSM_trtProb"]]) - # # print(length(untreated[[1]])) - # # summary(untreated[["PSM_trtProb"]]) + #Run the KNN for all neighbors. + # print(length(treated[[1]])) + # summary(treated[["PSM_trtProb"]]) + # print(length(untreated[[1]])) + # summary(untreated[["PSM_trtProb"]]) - # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # # time_list[2] <- round((proc.time() - timer)[3],5) - # # print("nn2.2") - # # timer <- proc.time() + # time_list[2] <- round((proc.time() - timer)[3],5) + # print("nn2.2") + # timer <- proc.time() - # # #Perturb the values based on the distance decay function, if selected. - # # if (!is.null(dist_PSM)) { - # # for (mC in 1:length(k[[1]])) { + # #Perturb the values based on the distance decay function, if selected. + # if (!is.null(dist_PSM)) { + # for (mC in 1:length(k[[1]])) { - # # print("nn2.2.0") + # print("nn2.2.0") - # # #Calculate the Euclidean Distance between pairs - # # Control_ID = toString(untreated[[ids]][[mC]]) + # #Calculate the Euclidean Distance between pairs + # Control_ID = toString(untreated[[ids]][[mC]]) - # # mT = k[["nn.index"]][mC] + # mT = k[["nn.index"]][mC] - # # Treatment_ID = toString(treated[[ids]][[mT]]) + # Treatment_ID = toString(treated[[ids]][[mT]]) - # # #Find the control x,y location - # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # #Find the control x,y location + # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # # #Find the treatment x,y location - # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # #Find the treatment x,y location + # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # # y_dist = abs(cCoord[1] - cCoord[2]) - # # x_dist = abs(tCoord[1] - tCoord[2]) - # # euc_dist = sqrt(y_dist^2 + x_dist^2) + # y_dist = abs(cCoord[1] - cCoord[2]) + # x_dist = abs(tCoord[1] - tCoord[2]) + # euc_dist = sqrt(y_dist^2 + x_dist^2) - # # print("nn2.2.1") + # print("nn2.2.1") - # # PSM_score = k[["nn.dist"]][mC] - # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # PSM_score = k[["nn.dist"]][mC] + # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # # print("nn2.2.2") + # print("nn2.2.2") - # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # # } + # } - # # } + # } - # # time_list[3] <- round((proc.time() - timer)[3],5) - # print("nn2.3") - # # timer <- proc.time() + # time_list[3] <- round((proc.time() - timer)[3],5) + print("nn2.3") + # timer <- proc.time() - # #Add the matched treatment and control values to the recording data frame - # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + #Add the matched treatment and control values to the recording data frame + #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - # #This will give us the matched index in the "treated" dataset. - # best_m_treated = k[["nn.index"]][best_m_control] + #This will give us the matched index in the "treated" dataset. + best_m_treated = k[["nn.index"]][best_m_control] - # # #Control PSM ID - # # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # # Control_ID = toString(eval(parse(text=cid_txt))) + # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - # # #Treatment PSM ID - # # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # # Treatment_ID = toString(eval(parse(text=tid_txt))) + # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) - # #Control PSM ID and Treatment PSM ID - # Control_ID = toString(untreated[[ids]][best_m_control]) - # Treatment_ID = toString(treated[[ids]][best_m_treated]) + #Control PSM ID and Treatment PSM ID + Control_ID = toString(untreated[[ids]][best_m_control]) + Treatment_ID = toString(treated[[ids]][best_m_treated]) - # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - # pair_id = paste(curgrp,j, sep="") + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + pair_id = paste(curgrp,j, sep="") - # # time_list[4] <- round((proc.time() - timer)[3],5) - # print("nn2.4x") - # # timer <- proc.time() + # time_list[4] <- round((proc.time() - timer)[3],5) + print("nn2.4x") + # timer <- proc.time() - # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # # time_list[5] <- round((proc.time() - timer)[3],5) - # # timer <- proc.time() + # time_list[5] <- round((proc.time() - timer)[3],5) + # timer <- proc.time() - # #Drop the paired match out of the iteration matrix - # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + #Drop the paired match out of the iteration matrix + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - # treated <- treated[which(treated[[ids]] != Treatment_ID),] - # untreated <- untreated[which(untreated[[ids]] != Control_ID),] + treated <- treated[which(treated[[ids]] != Treatment_ID),] + untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # # time_list[6] <- round((proc.time() - timer)[3],5) - # # print(paste(time_list)) + # time_list[6] <- round((proc.time() - timer)[3],5) + # print(paste(time_list)) - # } + } return(dta) From 648eb89fed8b3eb8e23eb6a8bc59a8f1c376316a Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 13 Nov 2015 09:34:20 -0500 Subject: [PATCH 207/212] Test matchit lib. --- R/fastNN_binary_func.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index f1ea1c6..ed708c1 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -318,10 +318,11 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { Treatment_ID = toString(treated[[ids]][best_m_treated]) - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - pair_id = paste(curgrp,j, sep="") - + #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + # pair_id = paste(curgrp,j, sep="") + pair_id <- paste(Treatment_ID,Control_ID, sep='__') + print(pair_id) # time_list[4] <- round((proc.time() - timer)[3],5) print("nn2.4x") From 849786148368b8c727d516216599f286e36825db Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 13 Nov 2015 09:37:38 -0500 Subject: [PATCH 208/212] Test matchit lib. --- R/fastNN_binary_func.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index ed708c1..c6449ce 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -306,17 +306,23 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # #Control PSM ID - # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - # Control_ID = toString(eval(parse(text=cid_txt))) + cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + Control_ID1 = toString(eval(parse(text=cid_txt))) # #Treatment PSM ID - # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - # Treatment_ID = toString(eval(parse(text=tid_txt))) + tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + Treatment_ID1 = toString(eval(parse(text=tid_txt))) + + print(Control_ID1) + print(Treatment_ID1) + #Control PSM ID and Treatment PSM ID Control_ID = toString(untreated[[ids]][best_m_control]) Treatment_ID = toString(treated[[ids]][best_m_treated]) + print(Control_ID) + print(Treatment_ID) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From 08a63fc992df6c56fca89a57702e087b753efcb8 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 13 Nov 2015 09:39:06 -0500 Subject: [PATCH 209/212] Test matchit lib. --- R/fastNN_binary_func.R | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index c6449ce..4566410 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -307,22 +307,12 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { # #Control PSM ID cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID1 = toString(eval(parse(text=cid_txt))) + Control_ID = toString(eval(parse(text=cid_txt))) # #Treatment PSM ID tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID1 = toString(eval(parse(text=tid_txt))) + Treatment_ID = toString(eval(parse(text=tid_txt))) - print(Control_ID1) - print(Treatment_ID1) - - - #Control PSM ID and Treatment PSM ID - Control_ID = toString(untreated[[ids]][best_m_control]) - Treatment_ID = toString(treated[[ids]][best_m_treated]) - - print(Control_ID) - print(Treatment_ID) #Create a unique pair ID for each group (will simply append a "1" if only 1 group) From 128c3db6bea77e6fb843a3fffa5e2750032c0f97 Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 13 Nov 2015 12:22:02 -0500 Subject: [PATCH 210/212] Test matchit lib. --- R/fastNN_binary_func.R | 232 ++++++++++++++++++++--------------------- 1 file changed, 116 insertions(+), 116 deletions(-) diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index 4566410..b4917d2 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -165,191 +165,191 @@ fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { - # print('nn') + print('nn') - # # make sure there are at least x rows in both treated and control - # row_min <- 30 - # trt_rows <- dta@data$TrtBin == 1 - # if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { - # return('drop') - # } + # make sure there are at least x rows in both treated and control + row_min <- 30 + trt_rows <- dta@data$TrtBin == 1 + if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + return('drop') + } - # match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] - # rownames(match_data) <- match_data[[ids]] + rownames(match_data) <- match_data[[ids]] - # zzz <<- match_data + zzz <<- match_data - # m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) - # dta@data[["PSM_match_ID"]] <- -999 + dta@data[["PSM_match_ID"]] <- -999 - # for ( i in rownames(m$match.matrix) ) { - # trt_id <- i - # cnt_id <- m$match.matrix[i,] + for ( i in rownames(m$match.matrix) ) { + trt_id <- i + cnt_id <- m$match.matrix[i,] - # if ( !is.na(cnt_id) ) { - # pair_id <- paste(trt_id,cnt_id, sep='__') - # dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id - # } - # } + if ( !is.na(cnt_id) ) { + pair_id <- paste(trt_id,cnt_id, sep='__') + dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + } + } # =========================================================================== - print("nn1.0") + # print("nn1.0") - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. - #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + # #Fast nearest neighbors search - will not arrive at optimum, + # #but this may not be an issue for many analysis. + # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + # sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] - #Conduct the matching - treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] - untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + # #Conduct the matching + # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - if (it_cnt < 30) { - return('drop') - } + # if (it_cnt < 30) { + # return('drop') + # } - # dta@data[["match"]] <- -999 - # dta@data[["PSM_distance"]] <- -999 - dta@data[["PSM_match_ID"]] <- -999 + # # dta@data[["match"]] <- -999 + # # dta@data[["PSM_distance"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 - # print("nn2") + # # print("nn2") - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) { - # time_list <- c() + # #Calculate a distance decay function + # #to perturb pairs based on their distances. + # for (j in 1:it_cnt) { + # # time_list <- c() - print(paste("nn cnt:",j)) - # timer <- proc.time() + # print(paste("nn cnt:",j)) + # # timer <- proc.time() - # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] - # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] + # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) - # time_list[1] <- round((proc.time() - timer)[3],5) - # print("nn2.1") - # timer <- proc.time() + # # time_list[1] <- round((proc.time() - timer)[3],5) + # # print("nn2.1") + # # timer <- proc.time() - #Run the KNN for all neighbors. - # print(length(treated[[1]])) - # summary(treated[["PSM_trtProb"]]) - # print(length(untreated[[1]])) - # summary(untreated[["PSM_trtProb"]]) + # #Run the KNN for all neighbors. + # # print(length(treated[[1]])) + # # summary(treated[["PSM_trtProb"]]) + # # print(length(untreated[[1]])) + # # summary(untreated[["PSM_trtProb"]]) - k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) + # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - # time_list[2] <- round((proc.time() - timer)[3],5) - # print("nn2.2") - # timer <- proc.time() + # # time_list[2] <- round((proc.time() - timer)[3],5) + # # print("nn2.2") + # # timer <- proc.time() - # #Perturb the values based on the distance decay function, if selected. - # if (!is.null(dist_PSM)) { - # for (mC in 1:length(k[[1]])) { + # # #Perturb the values based on the distance decay function, if selected. + # # if (!is.null(dist_PSM)) { + # # for (mC in 1:length(k[[1]])) { - # print("nn2.2.0") + # # print("nn2.2.0") - # #Calculate the Euclidean Distance between pairs - # Control_ID = toString(untreated[[ids]][[mC]]) + # # #Calculate the Euclidean Distance between pairs + # # Control_ID = toString(untreated[[ids]][[mC]]) - # mT = k[["nn.index"]][mC] + # # mT = k[["nn.index"]][mC] - # Treatment_ID = toString(treated[[ids]][[mT]]) + # # Treatment_ID = toString(treated[[ids]][[mT]]) - # #Find the control x,y location - # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + # # #Find the control x,y location + # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) - # #Find the treatment x,y location - # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + # # #Find the treatment x,y location + # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - # y_dist = abs(cCoord[1] - cCoord[2]) - # x_dist = abs(tCoord[1] - tCoord[2]) - # euc_dist = sqrt(y_dist^2 + x_dist^2) + # # y_dist = abs(cCoord[1] - cCoord[2]) + # # x_dist = abs(tCoord[1] - tCoord[2]) + # # euc_dist = sqrt(y_dist^2 + x_dist^2) - # print("nn2.2.1") + # # print("nn2.2.1") - # PSM_score = k[["nn.dist"]][mC] - # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # # PSM_score = k[["nn.dist"]][mC] + # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") - # print("nn2.2.2") + # # print("nn2.2.2") - # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) - # } + # # } - # } + # # } - # time_list[3] <- round((proc.time() - timer)[3],5) - print("nn2.3") - # timer <- proc.time() + # # time_list[3] <- round((proc.time() - timer)[3],5) + # print("nn2.3") + # # timer <- proc.time() - #Add the matched treatment and control values to the recording data frame - #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. - best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + # #Add the matched treatment and control values to the recording data frame + # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - #This will give us the matched index in the "treated" dataset. - best_m_treated = k[["nn.index"]][best_m_control] + # #This will give us the matched index in the "treated" dataset. + # best_m_treated = k[["nn.index"]][best_m_control] - # #Control PSM ID - cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) + # # #Control PSM ID + # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") + # Control_ID = toString(eval(parse(text=cid_txt))) - # #Treatment PSM ID - tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) + # # #Treatment PSM ID + # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") + # Treatment_ID = toString(eval(parse(text=tid_txt))) - #Create a unique pair ID for each group (will simply append a "1" if only 1 group) - # pair_id = paste(curgrp,j, sep="") - pair_id <- paste(Treatment_ID,Control_ID, sep='__') - print(pair_id) + # #Create a unique pair ID for each group (will simply append a "1" if only 1 group) + # # pair_id = paste(curgrp,j, sep="") + # pair_id <- paste(Treatment_ID,Control_ID, sep='__') + # print(pair_id) - # time_list[4] <- round((proc.time() - timer)[3],5) - print("nn2.4x") - # timer <- proc.time() + # # time_list[4] <- round((proc.time() - timer)[3],5) + # print("nn2.4x") + # # timer <- proc.time() - #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row - # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID - # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + # #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row + # # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID + # # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID - # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] - dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + # # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] + # dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id - # time_list[5] <- round((proc.time() - timer)[3],5) - # timer <- proc.time() + # # time_list[5] <- round((proc.time() - timer)[3],5) + # # timer <- proc.time() - #Drop the paired match out of the iteration matrix - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] - # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + # #Drop the paired match out of the iteration matrix + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] + # # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] - # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + # # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 - treated <- treated[which(treated[[ids]] != Treatment_ID),] - untreated <- untreated[which(untreated[[ids]] != Control_ID),] + # treated <- treated[which(treated[[ids]] != Treatment_ID),] + # untreated <- untreated[which(untreated[[ids]] != Control_ID),] - # time_list[6] <- round((proc.time() - timer)[3],5) - # print(paste(time_list)) + # # time_list[6] <- round((proc.time() - timer)[3],5) + # # print(paste(time_list)) - } + # } return(dta) From b9edba9e6bda334a69a51bc0e54160af82209d0c Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 6 May 2016 13:23:14 -0400 Subject: [PATCH 211/212] Fix type. --- R/SAT.R | 84 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/R/SAT.R b/R/SAT.R index 8f9812b..7b2e751 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -5,16 +5,16 @@ # #Initialization # pltObjs <- list() # init_dta <- dta - + # drop_unmatched = drop_opts["drop_unmatched"] # drop_method = drop_opts["drop_method"] # drop_thresh = as.numeric(drop_opts["drop_thresh"]) - - + + # print("sat1") -# if (!is.null(constraints) && contraints != c()) { +# if (!is.null(constraints) && constraints != c()) { # for (cst in 1:length(names(constraints))) { # if (names(constraints)[cst] == "groups") { # dta[,"ConstraintGroupSet_Opt"] <- dta[,get(constraints["groups"])] @@ -29,7 +29,7 @@ # dist_PSM=NULL # } # } - + # } else { # dta[,"ConstraintGroupSet_Opt"] <- 1 # #max the distance threshold by taking the diagonal of the bounding box. @@ -42,7 +42,7 @@ # #Caclulate the number of groups to constrain by, if any. # group_constraints <- unique(dta[,'ConstraintGroupSet_Opt'] - + # #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) # t_dta <- list() # u_dta <-list() @@ -69,7 +69,7 @@ # war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") # warning(war_statement) -# } else { +# } else { # t_dta[[t_index]] <- t_dta[[t_index]][ConstraintGroupSet_Opt == (cur_grp)] # u_dta[[t_index]] <- u_dta[[t_index]][ConstraintGroupSet_Opt == (cur_grp)] @@ -97,12 +97,12 @@ # if (mtd == "fastNN") { # # *** # # this is the slow part of functions -# temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) +# temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) # } # if (mtd == "NN_WithReplacement") { # print("NN with replacement is currently not available, please choose fastNN") -# # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) +# # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) # } # } @@ -113,7 +113,7 @@ # dta <- temp_dta[[1]] # for(k in 2:cnt) { # dta <- maptools::spRbind(dta, temp_dta[[k]]) -# } +# } # } else { # dta <- temp_dta[[1]] # } @@ -121,15 +121,15 @@ # print("sat5") # if (drop_unmatched == TRUE) { -# dta <- dta["PSM_match_ID" != -999] +# dta <- dta["PSM_match_ID" != -999] # } - + # anc_v_int <- strsplit(psm_eq, "~")[[1]][2] # anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) # anc_vars <- c(anc_vars[[1]], "PSM_trtProb") - + # print("sat6") - + # #Drop observations according to the selected method # if (drop_method == "SD") { # #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. @@ -139,7 +139,7 @@ # } # dta <- dta["PSM_distance" < psm_sd_thresh] # } - + # #Plot the pre and post-dropping balance for PSM model... @@ -149,7 +149,7 @@ # cnt = 0 # print("sat7") - + # for (i in 1:length(anc_vars)) { # print("sat7.0") @@ -174,7 +174,7 @@ # if ((c_type == "numeric") & (visual == "TRUE")) { # cnt = cnt + 1 # pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) -# pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) +# pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) # treat_mean_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[2]][[3]], 5) @@ -215,13 +215,13 @@ # treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, # it_diff_Mean_pre,it_diff_Mean_post)) # } - + # rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]", "", ed_v) # } # } # print("sat8") - + # if (visual=="TRUE") { # #Output graphics # #Remove the factor rows @@ -238,7 +238,7 @@ # #bTab <- stargazer(bRes,summary=FALSE,type="html") # #print.htmlTable(bTab) # } - + # return (as.data.frame(dta)) # } @@ -251,16 +251,16 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #Initialization pltObjs <- list() init_dta <- dta - + drop_unmatched = drop_opts["drop_unmatched"] drop_method = drop_opts["drop_method"] drop_thresh = as.numeric(drop_opts["drop_thresh"]) - - + + print("sat1") - if (!is.null(constraints) && contraints != c()) { + if (!is.null(constraints) && constraints != c()) { print("sat1a.1") @@ -284,7 +284,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # Caclulate the number of groups to constrain by, if any. group_constraints <- unique(dta$ConstraintGroupSet_Opt) - + # Make sure there are both treatment and control groups of an adequate size (>= 1 of each) t_dta <- list() u_dta <-list() @@ -311,7 +311,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") warning(war_statement) - } else { + } else { t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] @@ -338,12 +338,12 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (mtd == "fastNN") { # *** # this is the slow part of functions - temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) + temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) } # if (mtd == "NN_WithReplacement") { # print("NN with replacement is currently not available, please choose fastNN") - # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) # } } @@ -354,12 +354,12 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo dta <- temp_dta[[1]] for(k in 2:cnt) { dta <- maptools::spRbind(dta, temp_dta[[k]]) - } + } } else { dta <- temp_dta[[1]] } - + } else { print("sat1b.1") @@ -370,7 +370,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if (mtd == "fastNN") { # *** # this is the slow part of functions - dta <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) + dta <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) if (class(dta) == class('drop')) { return('drop') @@ -380,7 +380,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo # if (mtd == "NN_WithReplacement") { # print("NN with replacement is currently not available, please choose fastNN") - # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) # } @@ -392,15 +392,15 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat5") if (drop_unmatched == TRUE) { - dta <- dta[dta@data[,"PSM_match_ID"] != -999,] + dta <- dta[dta@data[,"PSM_match_ID"] != -999,] } - + anc_v_int <- strsplit(psm_eq, "~")[[1]][2] anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) anc_vars <- c(anc_vars[[1]], "PSM_trtProb") - + print("sat6") - + #Drop observations according to the selected method if (drop_method == "SD") { #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. @@ -410,7 +410,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo } dta <- dta[dta@data[,"PSM_distance"] < psm_sd_thresh,] } - + #Plot the pre and post-dropping balance for PSM model... @@ -420,7 +420,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo cnt = 0 print("sat7") - + for (i in 1:length(anc_vars)) { print("sat7.0") @@ -445,7 +445,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo if ((c_type == "numeric") & (visual == "TRUE")) { cnt = cnt + 1 pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) - pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) + pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) treat_mean_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[3]], 5) @@ -486,13 +486,13 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post)) } - + rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]", "", ed_v) } } print("sat8") - + if (visual=="TRUE") { #Output graphics #Remove the factor rows @@ -509,7 +509,7 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo #bTab <- stargazer(bRes,summary=FALSE,type="html") #print.htmlTable(bTab) } - + return (dta) } From 530b6dd5119d0882614d8ace0f2b536a3d6298fb Mon Sep 17 00:00:00 2001 From: userz Date: Fri, 6 May 2016 13:27:29 -0400 Subject: [PATCH 212/212] Fix maybe. --- R/SAT.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/SAT.R b/R/SAT.R index 7b2e751..a3e25a6 100644 --- a/R/SAT.R +++ b/R/SAT.R @@ -260,7 +260,8 @@ SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinCo print("sat1") - if (!is.null(constraints) && constraints != c()) { + # if (!is.null(constraints) && constraints != c()) { + if (!is.null(constraints) ) { print("sat1a.1")