# File paths for the pre-split train/test news dataset.
test_filename = "test.csv"
train_filename = "train.csv"
# Load both splits; each is expected to contain a "Statement" text column.
train_news = pd.read_csv(train_filename)
test_news = pd.read_csv(test_filename)

# We start with a simple bag-of-words technique:
# build the feature vector (document-term matrix) from the statements.
countV = CountVectorizer()
train_count = countV.fit_transform(train_news["Statement"])

# Print the training document-term matrix shape.
# For the full training set this is (10240, 12196).
print(train_count.shape)

# Check the learned vocabulary (token -> column index mapping).
print(countV.vocabulary_)

# Show the first 25 feature names.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
print(countV.get_feature_names_out()[:25])

# tf-idf: re-weight the raw counts by inverse document frequency.
tfidfV = TfidfTransformer()
train_tfidf = tfidfV.fit_transform(train_count)
print(train_tfidf.shape)
# --- After change: variant that reads the training data via the DataPrep module ---
# We start with a simple bag-of-words technique:
# build the feature vector (document-term matrix) from the statements.
# Here the training data comes from the project's DataPrep module instead
# of a local read_csv call; .values passes the raw ndarray of strings.
countV = CountVectorizer()
train_count = countV.fit_transform(DataPrep.train_news["Statement"].values)
# Inspect the fitted vectorizer and the resulting sparse count matrix.
print(countV)
print(train_count)