i am trying to use scikit learn 0.17 with anaconda 2.7 for a multilabel classification problem. here is my code
import pandas as pd
import pickle
import re
from sklearn.cross_validation import train_test_split
from sklearn.metrics.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
traindf = pickle.load(open("train.pkl","rb"))
X, y = traindf['colC'], traindf['colB'].as_matrix()
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7)
pip = Pipeline([
('vect', TfidfVectorizer(
                        analyzer='word',
                        binary=False,
                        decode_error='ignore',
                        dtype=<type 'numpy.int64'>,
                        encoding=u'utf-8',
                        input=u'content',
                        lowercase=True,
                        max_df=0.25,
                        max_features=None,
                        min_df=1,
                        ngram_range=(1, 1),
                        norm=u'l2',
                        preprocessor=None,
                        smooth_idf=True,
                        stop_words='english',
                        strip_accents=None,
                        sublinear_tf=True,
                        token_pattern=u'(?u)\\b\\w\\w+\\b',
                        tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
                        use_idf=True, vocabulary=None)),
('clf', LogisticRegression(
                        C=10,
                        class_weight=None,
                        dual=False,
                        fit_intercept=True,
                        intercept_scaling=1,
                        max_iter=100,
                        multi_class='multinomial',
                        n_jobs=1,
                        penalty='l2', 
                        random_state=None, 
                        solver='lbfgs',
                        tol=0.0001,
                        verbose=0, 
                        warm_start=False))
                ])
parameters = {}
gridSearchTS = GridSearchCV(pip,parameters,n_jobs=3, verbose=1, scoring='accuracy')
gridSearchTS.fit(Xtrain, ytrain)
predictions = gridSearchTS.predict(Xtest)
print ('Accuracy:', accuracy_score(ytest, predictions))
print ('Confusion Matrix:', confusion_matrix(ytest, predictions))
print ('Classification Report:', classification_report(ytest, predictions))
testdf = pickle.load(open("test.pkl","rb"))
predictions=gridSearchTS.predict(testdf['colC'])
testdf['colB'] = predictions
print(testdf.info())
testdf.to_csv("res.csv")
and here is what my data looks like
training
colC                colB
some text           [list of tags]
some text           [list of tags]
test
colC                    
some text           
some text
but i get the error
raise ValueError('You appear to be using a legacy multi-label data'
ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.
what does this mean?
here is the full stacktrace
Traceback (most recent call last):
  File "X:\asd.py", line 34, in getTags
    gridSearchTS.fit(Xtrain, ytrain)
  File "X:\popol\Continuum\Anaconda2\lib\site-packages\sklearn\grid_search.py", line 804, in fit
    return self._fit(X, y, ParameterGrid(self.param_grid))
  File "X:\popol\Continuum\Anaconda2\lib\site-packages\sklearn\grid_search.py", line 532, in _fit
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
  File "X:\popol\Continuum\Anaconda2\lib\site-packages\sklearn\cross_validation.py", line 1676, in check_cv
    if type_of_target(y) in ['binary', 'multiclass']:
  File "X:\popol\Continuum\Anaconda2\lib\site-packages\sklearn\utils\multiclass.py", line 251, in type_of_target
    raise ValueError('You appear to be using a legacy multi-label data'
ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead.
how do i fix this? do i need to change the format of my data? why does gridSearchTS.fit(Xtrain, ytrain) fail? how do i make X and y suitable for the fit function?
Edit
I tried
        from sklearn.preprocessing import MultiLabelBinarizer  
        y=MultiLabelBinarizer().fit_transform(y)      
        random_state = np.random.RandomState(0)
        # Split into training and test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=random_state)
        # Run classifier
        from sklearn import svm, datasets
        classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                         random_state=random_state))
        y_score = classifier.fit(X_train, y_train).decision_function(X_test)
but now i get
ValueError: could not convert string to float: <value of ColC here>
on
y_score = classifier.fit(X_train, y_train).decision_function(X_test) 
do i have to binarize X as well? why do i need to convert the X dimension to float?
 
    