I am trying to implement logistic regression and the function works manually, but for some reason I get the error "Error in nrow(X) : object 'X' not found ", even though X is defined before the nrow command. I use the UCI Data "Adult" to test it.
If i try to run the function manually there is no error. Can anyone explain that?
# Sigmoidfunction
sigmoid <- function(z){
  g <- 1/(1+exp(-z))
  return(g)
}
# Costfunction
cost <- function(theta){
  n <- nrow(X)
  g <- sigmoid(X %*% theta)
  J <- (1/n)*sum((-Y*log(g)) - ((1-Y)*log(1-g)))
  return(J)
}
log_reg <- function(datafr, m){
  # Train- und Testdaten Split
  sample <- sample(1:nrow(datafr), m)
  df_train <- datafr[sample,]
  df_test <- datafr[-sample,]
  num_features <- ncol(datafr) - 1
  num_label <- ncol(datafr)
  label_levels <- levels(datafr[, num_label])
  datafr[, num_features+1] <- ifelse(datafr[, num_label] == names(table(datafr[,num_label]))[1], 0, 1)
  # Predictor variables
  X <- as.matrix(df_train[, 1:num_features])
  X_test <- as.matrix(df_test[, 1:num_features])
  # Add ones to X
  X <- cbind(rep(1, nrow(X)), X)
  X_test <- cbind(rep(1, nrow(X_test)), X_test)
  # Response variable
  Y <- as.matrix(df_train[, num_label] )
  Y <- ifelse(Y == names(table(Y))[1], 0, 1)
  Y_test <- as.matrix(df_test[, num_label] )
  Y_test <- ifelse(Y_test == names(table(Y_test))[1], 0, 1)
  # Intial theta
  initial_theta <- rep(0, ncol(X))
  # Derive theta using gradient descent using optim function
  theta_optim <- optim(par=initial_theta, fn=cost)
  predictions <- ifelse(sigmoid(X_test%*%theta_optim$par)>=0.5, 1, 0)
# Generalization error
error_rate <- sum(predictions!=Y_test)/length(Y_test)
return(error_rate)
}
### Adult Data
data <- read.table('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', 
                    sep = ',', fill = F, strip.white = T)
colnames(data) <- c('age', 'workclass', 'fnlwgt', 'education', 
                    'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 
                    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income')
# Featureselection
datafr <- data[, c("age", "education_num", "hours_per_week", "income")]
log_reg(datafr = datafr, m = 20)
 
    