from sklearn.datasets import *
# Boston house prices - 506 points, 13 attributes (none missing)
X, y = load_boston(return_X_y=True)
print(X.shape)
(506, 13)
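A compatibility note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 over ethical concerns with the dataset; on newer versions the docs point to fetch_california_housing (used at the end of this section) or fetch_openml as substitutes. A minimal sketch:
# sketch, assuming scikit-learn >= 1.2 where load_boston is gone
X, y = fetch_california_housing(return_X_y=True)
print(X.shape)   # (20640, 8)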
# Iris plants - 150 points (50/class, 3 classes), 4 attributes (none missing)
data = load_iris()
print(data.target[[10, 25, 50]])
print(list(data.target_names))
[0 0 1]
['setosa', 'versicolor', 'virginica']
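Most of these loaders also accept as_frame=True (added in scikit-learn 0.23) to return pandas objects instead of numpy arrays; a minimal sketch, assuming pandas is installed:
df = load_iris(as_frame=True).frame   # features + 'target' column in one DataFrame
print(df.shape)                       # (150, 5)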
# diabetes (regression)
# 442 samples, 10 attributes, integer targets
X, y = load_diabetes(return_X_y=True)
print(X.shape)
(442, 10)
# digits (classification)
X, y = load_digits(return_X_y=True)
print(X.shape, y.shape)
(1797, 64) (1797,)
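X here is just the 8x8 digit images flattened to 64 features; the Bunch keeps the unflattened copies in .images, which is what you want for plotting:
digits = load_digits()
print(digits.images.shape)                              # (1797, 8, 8)
assert (digits.images.reshape(-1, 64) == digits.data).all()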
# linnerud (multi-output regression) - 20 points, 3 exercise features, 3 physiological targets
X, y = load_linnerud(return_X_y=True)
print(X.shape, y.shape)
(20, 3) (20, 3)
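Since y has three columns (weight, waist, pulse), linnerud is handy for trying multi-output regressors on the X, y just loaded; a minimal sketch:
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)   # LinearRegression handles multi-output y natively
print(model.predict(X[:1]).shape)      # (1, 3): one prediction per target column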
# wine (classification)
X, y = load_wine(return_X_y=True)
print(X.shape, y.shape)
(178, 13) (178,)
# breast cancer (classification)
X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)
(569, 30) (569,)
# olivetti faces (classification)
faces = fetch_olivetti_faces()
print(faces.data)
print(faces.target)
[[0.30991736 0.3677686  0.41735536 ... 0.15289256 0.16115703 0.1570248 ]
 [0.45454547 0.47107437 0.5123967  ... 0.15289256 0.15289256 0.15289256]
 [0.3181818  0.40082645 0.49173555 ... 0.14049587 0.14876033 0.15289256]
 ...
 [0.5        0.53305787 0.607438   ... 0.17768595 0.14876033 0.19008264]
 [0.21487603 0.21900827 0.21900827 ... 0.57438016 0.59090906 0.60330576]
 [0.5165289  0.46280992 0.28099173 ... 0.35950413 0.3553719  0.38429752]]
[ 0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1 ...
 39 39 39 39 39 39 39 39 39 39]
(target output truncated here: 40 subjects x 10 images each)
# 20 newsgroups text corpus (20 classes, ~18.8K samples)
# fetch_20newsgroups downloads the corpus to ~/scikit_learn_data/20news_home,
# then calls load_files on the training and/or test data folders.
data = fetch_20newsgroups(subset='train')
from pprint import pprint
pprint(list(data.target_names))
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
print(data.filenames.shape)
print(data.target.shape)
print(data.target[:10])
(11314,)
(11314,)
[ 7  4  4  1 14 16 13  3  2  4]
from sklearn.feature_extraction.text import TfidfVectorizer
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
print(vectors.shape)
print(vectors.nnz / float(vectors.shape[0]))   # average non-zero terms per document
(2034, 34118)
159.0132743362832
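To sanity-check what the vectorizer learned, you can map a document's heaviest columns back to tokens; a sketch, assuming scikit-learn >= 1.0 for get_feature_names_out:
import numpy as np
feature_names = vectorizer.get_feature_names_out()
row = vectors[0].toarray().ravel()
top5 = np.argsort(row)[-5:]            # indices of the 5 largest TF-IDF weights
print(list(zip(feature_names[top5], row[top5].round(3))))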
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
vectors_test = vectorizer.transform(newsgroups_test.data)
clf = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))
0.8821359240272957
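Macro-F1 averages over classes and can hide confusions between related groups; a per-class report is a cheap follow-up:
print(metrics.classification_report(newsgroups_test.target, pred,
                                    target_names=newsgroups_test.target_names))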
import numpy as np
def show_top10(classifier, vectorizer, categories):
    # coef_ on MultinomialNB was deprecated in 0.24 and removed in 1.1;
    # feature_log_prob_ holds the same per-class log-probabilities.
    # get_feature_names() was similarly replaced by get_feature_names_out().
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.feature_log_prob_[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))
show_top10(clf, vectorizer, newsgroups_train.target_names)
alt.atheism: edu it and in you that is of to the
comp.graphics: edu in graphics it is for and of to the
sci.space: edu it that is in and space to of the
talk.religion.misc: not it you in is that and to of the
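Those top features are almost all stopwords and header tokens ("edu", "the", "of") rather than topical words, a hint that the model leans on boilerplate; the remove parameter below strips it out.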
# The remove parameter strips out information; it takes any combination of
# 'headers', 'footers' and 'quotes'.
newsgroups_test_nh = fetch_20newsgroups(subset='test', remove=('headers',),
                                        categories=categories)
newsgroups_test_nf = fetch_20newsgroups(subset='test', remove=('footers',),
                                        categories=categories)
newsgroups_test_nq = fetch_20newsgroups(subset='test', remove=('quotes',),
                                        categories=categories)
newsgroups_test_all = fetch_20newsgroups(subset='test',
                                         remove=('headers', 'footers', 'quotes'),
                                         categories=categories)
vectors_test_nh = vectorizer.transform(newsgroups_test_nh.data)
vectors_test_nf = vectorizer.transform(newsgroups_test_nf.data)
vectors_test_nq = vectorizer.transform(newsgroups_test_nq.data)
vectors_test_all = vectorizer.transform(newsgroups_test_all.data)
pred_nh = clf.predict(vectors_test_nh)
pred_nf = clf.predict(vectors_test_nf)
pred_nq = clf.predict(vectors_test_nq)
pred_all = clf.predict(vectors_test_all)
print(metrics.f1_score(newsgroups_test_nh.target, pred_nh, average='macro'))
print(metrics.f1_score(newsgroups_test_nf.target, pred_nf, average='macro'))
print(metrics.f1_score(newsgroups_test_nq.target, pred_nq, average='macro'))
print(metrics.f1_score(newsgroups_test_all.target, pred_all, average='macro'))
0.870270086666271
0.8819288688799596
0.8401551094938573
0.7731035068127478
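Each drop measures how much of the original 0.882 macro-F1 came from metadata rather than message text: footers barely matter, quotes matter more, and with headers, footers and quotes all stripped the score falls to ~0.77.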
# Labeled Faces in the Wild (LFW)
# The loader downloads the archive to ~/scikit_learn_data/lfw_home/ using joblib,
# parses the metadata, decodes the jpegs, and converts the slices into memmapped
# numpy arrays. target is the id of the person in each picture; color=True loads
# the RGB channels instead of grayscale. fetch_lfw_pairs supports 'train', 'test'
# and '10_folds' evaluation subsets.
data = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
print(data.target_names.size,"\t",data.target_names[0])
print(data.data.dtype)
print(data.data.shape)
print(data.images.shape)
7 	 Ariel Sharon
float32
(1288, 1850)
(1288, 50, 37)
pairs = fetch_lfw_pairs(subset='train')
print(pairs.target_names)
print(pairs.pairs.shape)
print(pairs.data.shape)
print(pairs.target.shape)
['Different persons' 'Same person']
(2200, 2, 62, 47)
(2200, 5828)
(2200,)
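A hedged sketch of the color=True option mentioned above (another large download; the trailing channel axis is the expected shape per the docs, not verified here):
data_rgb = fetch_lfw_people(min_faces_per_person=70, resize=0.4, color=True)
print(data_rgb.images.shape)   # expected (1288, 50, 37, 3): RGB channels kept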
# Forest covertypes (581K samples, 54 attributes, 7 classes)
# fetch_covtype returns data (a dict-like 'Bunch' object) with the feature
# matrix and target; as_frame=True returns a pandas DataFrame instead.
data = fetch_covtype()
print(data.data.shape)
print(data.target.shape)
print(data.data[0])
(581012, 54)
(581012,)
[2.596e+03 5.100e+01 3.000e+00 2.580e+02 0.000e+00 5.100e+02 2.210e+02
 2.320e+02 1.480e+02 6.279e+03 1.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
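And the as_frame=True variant mentioned above; a sketch, assuming scikit-learn >= 0.24 (when fetch_covtype gained the parameter):
df = fetch_covtype(as_frame=True).frame
print(df.shape)   # expected (581012, 55): 54 features plus the target column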
# RCV1: Reuters Corpus Volume 1 text corpus
# data is a scipy sparse CSR matrix; the non-zero values are cosine-normalized
#   TF-IDF vectors. The first 23K samples are the training set, the last 781K
#   the test set; the array should have ~0.16% non-zero values.
# target is a scipy sparse CSR matrix: 804K samples x 103 categories, with a 1
#   in each sample's categories and a 0 in the others; ~3.1% non-zero.
# target_names are the topics of each sample (1 to 17 per sample); there are
#   103 topics (strings), with corpus frequencies from 5 ('GMIL') to 381K ('CCAT').
data = fetch_rcv1()
print(data.data.shape)
print(data.target.shape)
print(data.sample_id[:3])
print(data.target_names[:10].tolist())
(804414, 47236)
(804414, 103)
[2286 2287 2288]
['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16', 'C17']
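Reading a sample's topics out of the sparse indicator matrix takes one nonzero() call; a minimal sketch:
import numpy as np
topic_cols = data.target[0].nonzero()[1]          # column indices holding 1s
print(np.asarray(data.target_names)[topic_cols])  # that sample's topic labels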
# KDD Cup '99 network intrusion data
# The original dataset is ~80% abnormal traffic - unrealistic for anomaly
# detection - so it is split into the SA and SF subsets. SF keeps the samples
# whose logged_in attribute is positive, focusing on intrusion attacks (~0.3%);
# http and smtp are subsets of SF.
# Full set: 4.89M samples, 41 discrete/continuous attributes; smtp: 95K samples,
# 3 attributes (9,571 with the default percent10=True).
# Returns a data feature matrix and target values; as_frame=True returns data
# as a pandas DataFrame and target as a pandas Series.
data = fetch_kddcup99(subset='smtp')
print(data.data.shape)
print(data.target.shape)
print(data.data[0])
(9571, 3)
(9571,)
[-2.3025850929940455 8.12151008316269 5.796361655949294]
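The smtp subset is a standard anomaly-detection benchmark; a minimal sketch with IsolationForest, assuming the targets are byte strings such as b'normal.' (as fetch_kddcup99 returns them):
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
y_true = (data.target != b'normal.').astype(int)   # 1 = attack, 0 = normal
iso = IsolationForest(random_state=0).fit(data.data)
scores = -iso.score_samples(data.data)             # higher = more anomalous
print(roc_auc_score(y_true, scores))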
# California housing (regression) - 20.6K samples, 8 attributes
data = fetch_california_housing()
print(data.data.shape)
print(data.target.shape)
print(data.data[0])
(20640, 8)
(20640,)
[ 8.3252 41. 6.98412698 1.02380952 322. 2.55555556 37.88 -122.23 ]
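A quick end-to-end sketch to show the Bunch in use (a plain linear model typically lands around R^2 = 0.6 on this data, though the exact score depends on the split):
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, random_state=0)
print(LinearRegression().fit(X_tr, y_tr).score(X_te, y_te))   # R^2 on held-out data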