I have some machine learning results that I am trying to make sense of. The task is to predict/label "Irish" vs. "non-Irish". Python 2.7's output:
1= ir
0= non-ir
Class count:
0    4090942
1     940852
Name: ethnicity_scan, dtype: int64
Accuracy: 0.874921350119
Classification report:
             precision    recall  f1-score   support
          0       0.89      0.96      0.93   2045610
          1       0.74      0.51      0.60    470287
avg / total       0.87      0.87      0.87   2515897
Confusion matrix:
[[1961422   84188]
 [ 230497  239790]]
AUC-ir= 0.901238104773
As you can see, the precision and recall are mediocre, but the AUC-ROC is higher (~0.90). And I am trying to figure out why, which I suspect is due to data imbalance (about 1:5). Based on the confusion matrix, and using Irish as the target (+), I calculated the TPR=0.51 and FPR=0.04. If I am considering non-Irish as (+), then TPR=0.96 and FPR=0.49. So how can I get a 0.9 AUC while the TPR can be only 0.5 at FPR=0.04?
Codes:
try:
    for i in mass[k]:
        df = df_temp # reset df before each loop
        #$$
        #$$ 
        if 1==1:
        ###if i == singleEthnic:
            count+=1
            ethnicity_tar = str(i) # fr, en, ir, sc, others, ab, rus, ch, it, jp
            # fn, metis, inuit; algonquian, iroquoian, athapaskan, wakashan, siouan, salish, tsimshian, kootenay
            ############################################
            ############################################
            def ethnicity_target(row):
                try:
                    if row[ethnicity_var] == ethnicity_tar:
                        return 1
                    else:
                        return 0
                except: return None
            df['ethnicity_scan'] = df.apply(ethnicity_target, axis=1)
            print '1=', ethnicity_tar
            print '0=', 'non-'+ethnicity_tar
            # Random sampling a smaller dataframe for debugging
            rows = df.sample(n=subsample_size, random_state=seed) # Seed gives fixed randomness
            df = DataFrame(rows)
            print 'Class count:'
            print df['ethnicity_scan'].value_counts()
            # Assign X and y variables
            X = df.raw_name.values
            X2 = df.name.values
            X3 = df.gender.values
            X4 = df.location.values
            y = df.ethnicity_scan.values
            # Feature extraction functions
            def feature_full_name(nameString):
                try:
                    full_name = nameString
                    if len(full_name) > 1: # not accept name with only 1 character
                        return full_name
                    else: return '?'
                except: return '?'
            def feature_full_last_name(nameString):
                try:
                    last_name = nameString.rsplit(None, 1)[-1]
                    if len(last_name) > 1: # not accept name with only 1 character
                        return last_name
                    else: return '?'
                except: return '?'
            def feature_full_first_name(nameString):
                try:
                    first_name = nameString.rsplit(' ', 1)[0]
                    if len(first_name) > 1: # not accept name with only 1 character
                        return first_name
                    else: return '?'
                except: return '?'
            # Transform format of X variables, and spit out a numpy array for all features
            my_dict = [{'last-name': feature_full_last_name(i)} for i in X]
            my_dict5 = [{'first-name': feature_full_first_name(i)} for i in X]
            all_dict = []
            for i in range(0, len(my_dict)):
                temp_dict = dict(
                    my_dict[i].items() + my_dict5[i].items()
                    )
                all_dict.append(temp_dict)
            newX = dv.fit_transform(all_dict)
            # Separate the training and testing data sets
            X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, y, test_size=testTrainSplit)
            # Fitting X and y into model, using training data
            classifierUsed2.fit(X_train, y_train)
            # Making predictions using trained data
            y_train_predictions = classifierUsed2.predict(X_train)
            y_test_predictions = classifierUsed2.predict(X_test)
Inserted codes for resampling:
try:
    for i in mass[k]:
        df = df_temp # reset df before each loop
        #$$
        #$$ 
        if 1==1:
        ###if i == singleEthnic:
            count+=1
            ethnicity_tar = str(i) # fr, en, ir, sc, others, ab, rus, ch, it, jp
            # fn, metis, inuit; algonquian, iroquoian, athapaskan, wakashan, siouan, salish, tsimshian, kootenay
            ############################################
            ############################################
            def ethnicity_target(row):
                try:
                    if row[ethnicity_var] == ethnicity_tar:
                        return 1
                    else:
                        return 0
                except: return None
            df['ethnicity_scan'] = df.apply(ethnicity_target, axis=1)
            print '1=', ethnicity_tar
            print '0=', 'non-'+ethnicity_tar
            # Resampled
            df_resampled = df.append(df[df.ethnicity_scan==0].sample(len(df)*5, replace=True))
            # Random sampling a smaller dataframe for debugging
            rows = df_resampled.sample(n=subsample_size, random_state=seed) # Seed gives fixed randomness
            df = DataFrame(rows)
            print 'Class count:'
            print df['ethnicity_scan'].value_counts()
            # Assign X and y variables
            X = df.raw_name.values
            X2 = df.name.values
            X3 = df.gender.values
            X4 = df.location.values
            y = df.ethnicity_scan.values
            # Feature extraction functions
            def feature_full_name(nameString):
                try:
                    full_name = nameString
                    if len(full_name) > 1: # not accept name with only 1 character
                        return full_name
                    else: return '?'
                except: return '?'
            def feature_full_last_name(nameString):
                try:
                    last_name = nameString.rsplit(None, 1)[-1]
                    if len(last_name) > 1: # not accept name with only 1 character
                        return last_name
                    else: return '?'
                except: return '?'
            def feature_full_first_name(nameString):
                try:
                    first_name = nameString.rsplit(' ', 1)[0]
                    if len(first_name) > 1: # not accept name with only 1 character
                        return first_name
                    else: return '?'
                except: return '?'
            # Transform format of X variables, and spit out a numpy array for all features
            my_dict = [{'last-name': feature_full_last_name(i)} for i in X]
            my_dict5 = [{'first-name': feature_full_first_name(i)} for i in X]
            all_dict = []
            for i in range(0, len(my_dict)):
                temp_dict = dict(
                    my_dict[i].items() + my_dict5[i].items()
                    )
                all_dict.append(temp_dict)
            newX = dv.fit_transform(all_dict)
            # Separate the training and testing data sets
            X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, y, test_size=testTrainSplit)
            # Fitting X and y into model, using training data
            classifierUsed2.fit(X_train, y_train)
            # Making predictions using trained data
            y_train_predictions = classifierUsed2.predict(X_train)
            y_test_predictions = classifierUsed2.predict(X_test)