I'm trying to use CountVectorizer() with Pipeline and ColumnTransformer. Because CountVectorizer() produces sparse matrix, I used FunctionTransformer to ensure the ColumnTransformer can hstack correctly when putting together the resulting matrix.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from typing import Callable
# Dataset
df = pd.DataFrame([['a', 'Hi Tom', 'It is hot', 1],
['b', 'How you been Tom', 'hot coffee', 2],
['c', 'Hi you', 'I want some coffee', 3]],
columns=['col_for_ohe', 'col_for_countvectorizer_1', 'col_for_countvectorizer_2', 'num_col'])
# Use FunctionTransformer to ensure dense matrix
def tf_text(X, vectorizer_tf: Callable):
X_vect_ = vectorizer_tf.fit_transform(X)
return X_vect_.toarray()
tf_transformer = FunctionTransformer(tf_text, kw_args={'vectorizer_tf': CountVectorizer()})
# Transformation Pipelines
tf_transformer_pipe = Pipeline(
steps = [('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('tf', tf_transformer)])
ohe_transformer_pipe = Pipeline(
steps = [('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))])
transformer = ColumnTransformer(transformers=[
('cat_ohe', ohe_transformer_pipe, ['col_for_ohe']),
('cat_tf', tf_transformer_pipe, ['col_for_countvectorizer_1', 'col_for_countvectorizer_2'])
], remainder='passthrough')
transformed_df = transformer.fit_transform(df)
I get AttributeError: 'numpy.ndarray' object has no attribute 'lower.' I've seen this question and suspect CountVectorizer() is the culprit but not sure how to solve it (previous question doesn't use ColumnTransformer). I stumbled upon a DenseTransformer that I wish I could use instead of FunctionTransformer but unfortunately it is not supported in my company.

