from sklearn.feature_extraction.text import CountVectorizer
# Count vectorizer with default settings; reused by the following cells.
vectorizer = CountVectorizer()

# Four short example documents. Later cells fit other vectorizers on the
# same list.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',]

# Fit the vectorizer on the corpus and transform the corpus in one call.
X = vectorizer.fit_transform(corpus)
# Bare expression: shown as the cell result when run in a notebook.
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>


# Callable the fitted vectorizer uses to turn a raw string into tokens.
analyze = vectorizer.build_analyzer()

# The expected token list is lower-cased and has no punctuation and no "a".
print(analyze("This is a text document to analyze.") == (
    ['this', 'is', 'text', 'document', 'to', 'analyze']))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() (scikit-learn >= 1.0) returns a NumPy array, so it
# is converted to a list before comparing against a plain list.
# This comparison is neither printed nor the last expression of the cell, so
# its result is not displayed.
vectorizer.get_feature_names_out().tolist() == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])

# Dense view of the count matrix: one row per document, one column per
# vocabulary term.
print(X.toarray())

True
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


# Column index the fitted vocabulary assigns to the token 'document'.
document_index = vectorizer.vocabulary_.get('document')
print(document_index)

# Transform a sentence using the already-fitted vocabulary and show the
# resulting dense count row.
unseen_counts = vectorizer.transform(['Something completely new.'])
print(unseen_counts.toarray())

1
[[0 0 0 0 0 0 0 0 0]]


# Vectorizer configured with ngram_range=(1, 2) and a custom token pattern.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)
# Rebinds `analyze`, which an earlier cell bound to the first vectorizer's
# analyzer.
analyze = bigram_vectorizer.build_analyzer()
# Last expression of the cell: compares the analyzer output with the expected
# single tokens followed by the two-token sequences.
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])

True


# Fit the bigram vectorizer on the corpus, then densify the result.
bigram_counts = bigram_vectorizer.fit_transform(corpus)
X_2 = bigram_counts.toarray()
print(X_2)

# Look up the column of the 'is this' feature and print that column for
# every document.
feature_index = bigram_vectorizer.vocabulary_.get('is this')
is_this_column = X_2[:, feature_index]
print(is_this_column)

[[0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0]
 [0 0 1 0 0 1 1 0 0 2 1 1 1 0 1 0 0 0 1 1 0]
 [1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0]
 [0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1]]
[0 0 0 1]


from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF transformer constructed with smooth_idf=False.
transformer = TfidfTransformer(smooth_idf=False)

# Hand-written count matrix as nested lists: 6 rows with 3 counts each.
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

# Fit on the counts and transform them in one call.
tfidf = transformer.fit_transform(counts)
# Print the result as returned, a blank-line separator, then its dense form.
print(tfidf,"\n\n",tfidf.toarray())

  (0, 2)	0.5732079309279059
  (0, 0)	0.8194099510753754
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 1)	0.8808994832762984
  (4, 0)	0.47330339145578754
  (5, 2)	0.8135516873095774
  (5, 0)	0.5814926070688599 

 [[0.81940995 0.         0.57320793]
 [1.         0.         0.        ]
 [1.         0.         0.        ]
 [1.         0.         0.        ]
 [0.47330339 0.88089948 0.        ]
 [0.58149261 0.         0.81355169]]


# Rebind `transformer` to a TfidfTransformer with default settings (the
# earlier cell passed smooth_idf=False).
transformer = TfidfTransformer()
# Last expression of the cell: dense TF-IDF values computed from `counts`.
transformer.fit_transform(counts).toarray()

array([[0.85151335, 0.        , 0.52433293],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.55422893, 0.83236428, 0.        ],
       [0.63035731, 0.        , 0.77630514]])


# Per-feature weights stored on the transformer by the preceding fit
# (the recorded output shows one value per column of `counts`).
transformer.idf_

array([1.        , 2.25276297, 1.84729786])


from sklearn.feature_extraction.text import TfidfVectorizer
# Rebinds `vectorizer` (a CountVectorizer in the earlier cells) to a
# TfidfVectorizer with default settings.
vectorizer = TfidfVectorizer()
# Last expression of the cell: fit on `corpus` and return the transformed
# matrix.
vectorizer.fit_transform(corpus)

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>


from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# Restrict the dataset to two newsgroups.
categories = ['alt.atheism', 'talk.religion.misc']
# To run the analysis on all the categories, use this instead:
# categories = None

# Load the training subset for the selected categories and report its size.
data = fetch_20newsgroups(subset='train', categories=categories)
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("{:d} documents".format(n_documents))
print("{:d} categories".format(n_categories))

857 documents
2 categories


# Three named steps chained in order: token counts ('vect'), TF-IDF
# re-weighting ('tfidf'), and an SGD-trained classifier ('clf').
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Candidate values for the grid search. Each key is
# "<pipeline step name>__<parameter name>".
# Uncommenting more entries widens the search but multiplies the number of
# candidate combinations, and therefore the processing time.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # Alternative to the single-value 'clf__max_iter' entry above; enable only
    # one of the two, since a repeated dict key silently keeps the last value.
    # 'clf__max_iter': (10, 50, 80),
}


# Exhaustive search over `parameters` for the whole pipeline.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# Show what is about to be searched.
step_names = [step[0] for step in pipeline.steps]
print("pipeline:", step_names)
print("parameters:")
pprint(parameters)

# Time the full search.
t0 = time()
grid_search.fit(data.data, data.target)
print("done in {:.3f}s".format(time() - t0))

# Report the best score and, for every searched parameter, the value the
# best estimator ended up with.
print("Best score: {:.3f}".format(grid_search.best_score_))
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t{}: {!r}".format(param_name, best_parameters[param_name]))

pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (20,),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
done in 12.465s
Best score: 0.952
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


import chardet    
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"
text2 = b"holdselig sind deine Ger\xfcche"
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"
decoded = [x.decode(chardet.detect(x)['encoding'])
           for x in (text1, text2, text3)]        
v = CountVectorizer().fit(decoded).vocabulary_    
for term in v: print(v)

{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}
{'sei': 15, 'mir': 13, 'gegrüßt': 6, 'mein': 12, 'sauerkraut': 14, 'holdselig': 10, 'sind': 16, 'deine': 1, 'gerüche': 7, 'auf': 0, 'flügeln': 4, 'des': 2, 'gesanges': 8, 'herzliebchen': 9, 'trag': 17, 'ich': 11, 'dich': 3, 'fort': 5}


# Character 2-grams with the 'char_wb' analyzer; the expected features below
# include space-padded grams such as ' w' and 's '.
ngram_vectorizer = CountVectorizer(analyzer='char_wb',
                                   ngram_range=(2, 2))
# NOTE: rebinds `counts`, which an earlier cell used for a nested list.
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() (scikit-learn >= 1.0) returns a NumPy array, so it
# is converted to a list before comparing against a plain list.
print(ngram_vectorizer.get_feature_names_out().tolist() == (
    [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp']))

# Last expression of the cell: dense integer count matrix.
counts.toarray().astype(int)

True

array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])


# Character 5-grams with the 'char_wb' analyzer: per the expected list below,
# no gram spans the space between the two words.
ngram_vectorizer = CountVectorizer(analyzer='char_wb',
                                   ngram_range=(5, 5))
# Only the learned vocabulary is needed here, so fit() is sufficient.
ngram_vectorizer.fit(['jumpy fox'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() (scikit-learn >= 1.0) returns a NumPy array, so it
# is converted to a list before comparing against a plain list.
print(ngram_vectorizer.get_feature_names_out().tolist() == (
    [' fox ', ' jump', 'jumpy', 'umpy ']),"\n")


# Same input with the plain 'char' analyzer: per the expected list below,
# grams such as 'mpy f' do span the space.
ngram_vectorizer = CountVectorizer(analyzer='char',
                                   ngram_range=(5, 5))
ngram_vectorizer.fit(['jumpy fox'])
print(ngram_vectorizer.get_feature_names_out().tolist() == (
    ['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox']),"\n")

True 

True


from sklearn.feature_extraction.text import HashingVectorizer
# Hashing vectorizer restricted to 10 output columns.
hv = HashingVectorizer(n_features=10)
# Last expression of the cell. Note that transform() is called directly,
# with no preceding fit() on this object.
hv.transform(corpus)

<4x10 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>


# Rebind `hv` to a HashingVectorizer with default settings (the earlier cell
# passed n_features=10).
hv = HashingVectorizer()
# Last expression of the cell: transform the same corpus again.
hv.transform(corpus)

<4x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>


from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    """Callable that tokenizes a document and lemmatizes every token.

    Instances hold one ``WordNetLemmatizer`` and can be called like a
    function: ``LemmaTokenizer()(doc)``.
    """

    def __init__(self):
        # One lemmatizer per tokenizer instance, reused across calls.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized ``word_tokenize`` tokens of ``doc`` as a list."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

# Count vectorizer that is handed a LemmaTokenizer instance as its tokenizer.
vect = CountVectorizer(tokenizer=LemmaTokenizer())


import re
def to_british(tokens):
    """Yield every token after applying a fixed chain of spelling rewrites.

    NOTE(review): despite the name, the rules rewrite British-style endings
    to American-style ones (e.g. "colour" -> "color", "centre" -> "center").

    The rewrites are applied in order, each one to the result of the
    previous one:

    * ``<3 chars>our`` at the end  -> ``<3 chars>or``
    * ``bre`` / ``tre`` at the end -> ``ber`` / ``ter``
    * ``is`` / ``ys`` followed by a final ``e``, or by ``ing`` or ``ation``
      anywhere in the token, -> ``iz`` / ``yz`` plus the same suffix
    * ``ogue`` at the end          -> ``og``

    Args:
        tokens: iterable of string tokens.

    Yields:
        One rewritten token per input token, in input order.
    """
    # (pattern, replacement) pairs; order matters because each rule sees the
    # output of the one before it.
    rewrites = (
        (r"(...)our$", r"\1or"),
        (r"([bt])re$", r"\1er"),
        (r"([iy])s(e$|ing|ation)", r"\1z\2"),
        (r"ogue$", "og"),
    )
    for token in tokens:
        for pattern, replacement in rewrites:
            token = re.sub(pattern, replacement, token)
        yield token

class CustomVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are post-processed by ``to_british``."""

    def build_tokenizer(self):
        """Return a tokenizer that feeds the inherited tokenizer's output
        through ``to_british`` and returns the result as a list."""
        base_tokenizer = super().build_tokenizer()

        def tokenize_and_rewrite(doc):
            # to_british is a generator; materialize it into a list.
            return list(to_british(base_tokenizer(doc)))

        return tokenize_and_rewrite

# Run the custom analyzer on both spellings; the recorded output shows them
# collapsing to the same token.
print(CustomVectorizer().build_analyzer()(u"color colour"))

['color', 'color']

Text Feature Extraction ¶

Bag of Words¶

Sparsity¶

Count Vectorizer ¶

Stop Words¶

Tf-Idf Transformer and Vectorizer ¶

Example¶

Tfidf Vectorizer ¶

Binary Occurrences¶

Decoding Text files¶

Bag of Words Limitations¶

Example:¶

The Hashing Trick¶

Out-of-core Scaling with Hashing Vectorizer ¶

Custom Vectorizer Classes¶

Text Feature Extraction¶

Bag of Words¶

Sparsity¶

Count Vectorizer¶

Stop Words¶

Tf-Idf Transformer and Vectorizer¶

Example¶

Tfidf Vectorizer¶

Binary Occurrences¶

Decoding Text files¶

Bag of Words Limitations¶

Example:¶

The Hashing Trick¶

Out-of-core Scaling with Hashing Vectorizer¶

Custom Vectorizer Classes¶

Text Feature Extraction ¶

Count Vectorizer ¶

Tf-Idf Transformer and Vectorizer ¶

Tfidf Vectorizer ¶

Out-of-core Scaling with Hashing Vectorizer ¶