from sklearn.datasets import *
# Boston house prices - 506 points, 13 attributes (none missing)
X, y = load_boston(return_X_y=True)
print(X.shape)
(506, 13)
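A compatibility note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 over ethical concerns with the dataset; on newer versions the docs point to fetch_california_housing (used at the end of this section) or fetch_openml as substitutes. A minimal sketch:
# sketch, assuming scikit-learn >= 1.2 where load_boston is gone
X, y = fetch_california_housing(return_X_y=True)
print(X.shape)   # (20640, 8)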
# Iris plants - 150 points (50/class, 3 classes), 4 attributes (none missing)
data = load_iris()
print(data.target[[10, 25, 50]])
print(list(data.target_names))
[0 0 1]
['setosa', 'versicolor', 'virginica']
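Most of these loaders also accept as_frame=True (added in scikit-learn 0.23) to return pandas objects instead of numpy arrays; a minimal sketch, assuming pandas is installed:
df = load_iris(as_frame=True).frame   # features + 'target' column in one DataFrame
print(df.shape)                       # (150, 5)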
# diabetes (regression)
# 442 samples, 10 attributes, integer targets
X, y = load_diabetes(return_X_y=True)
print(X.shape)
(442, 10)
# digits (classification)
X, y = load_digits(return_X_y=True)
print(X.shape, y.shape)
(1797, 64) (1797,)
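X here is just the 8x8 digit images flattened to 64 features; the Bunch keeps the unflattened copies in .images, which is what you want for plotting:
digits = load_digits()
print(digits.images.shape)                              # (1797, 8, 8)
assert (digits.images.reshape(-1, 64) == digits.data).all()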
# linnerud (multi-output regression) - 20 points, 3 exercise features, 3 physiological targets
X, y = load_linnerud(return_X_y=True)
print(X.shape, y.shape)
(20, 3) (20, 3)
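Since y has three columns (weight, waist, pulse), linnerud is handy for trying multi-output regressors on the X, y just loaded; a minimal sketch:
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)   # LinearRegression handles multi-output y natively
print(model.predict(X[:1]).shape)      # (1, 3): one prediction per target column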
# wine (classification)
X, y = load_wine(return_X_y=True)
print(X.shape, y.shape)
(178, 13) (178,)
# breast cancer (classification)
X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)
(569, 30) (569,)
# olivetti faces (classification)
faces = fetch_olivetti_faces()
print(faces.data)
print(faces.target)
[[0.30991736 0.3677686  0.41735536 ... 0.15289256 0.16115703 0.1570248 ]
 [0.45454547 0.47107437 0.5123967  ... 0.15289256 0.15289256 0.15289256]
 [0.3181818  0.40082645 0.49173555 ... 0.14049587 0.14876033 0.15289256]
 ...
 [0.5        0.53305787 0.607438   ... 0.17768595 0.14876033 0.19008264]
 [0.21487603 0.21900827 0.21900827 ... 0.57438016 0.59090906 0.60330576]
 [0.5165289  0.46280992 0.28099173 ... 0.35950413 0.3553719  0.38429752]]
[ 0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1 ...
 39 39 39 39 39 39 39 39 39 39]
(target output truncated here: 40 subjects x 10 images each)
# 20 newsgroups text corpus (20 classes, ~18.8K samples)
# fetch_20newsgroups downloads the corpus to ~/scikit_learn_data/20news_home,
# then calls load_files on the training and/or test data folders.
data = fetch_20newsgroups(subset='train')
from pprint import pprint
pprint(list(data.target_names))
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
print(data.filenames.shape)
print(data.target.shape)
print(data.target[:10])
(11314,)
(11314,)
[ 7  4  4  1 14 16 13  3  2  4]
from sklearn.feature_extraction.text import TfidfVectorizer
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
print(vectors.shape)
print(vectors.nnz / float(vectors.shape[0]))   # average non-zero terms per document
(2034, 34118)
159.0132743362832
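To sanity-check what the vectorizer learned, you can map a document's heaviest columns back to tokens; a sketch, assuming scikit-learn >= 1.0 for get_feature_names_out:
import numpy as np
feature_names = vectorizer.get_feature_names_out()
row = vectors[0].toarray().ravel()
top5 = np.argsort(row)[-5:]            # indices of the 5 largest TF-IDF weights
print(list(zip(feature_names[top5], row[top5].round(3))))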
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
vectors_test = vectorizer.transform(newsgroups_test.data)
clf = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))
0.8821359240272957
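Macro-F1 averages over classes and can hide confusions between related groups; a per-class report is a cheap follow-up:
print(metrics.classification_report(newsgroups_test.target, pred,
                                    target_names=newsgroups_test.target_names))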
import numpy as np
def show_top10(classifier, vectorizer, categories):
    # coef_ on MultinomialNB was deprecated in 0.24 and removed in 1.1;
    # feature_log_prob_ holds the same per-class log-probabilities.
    # get_feature_names() was similarly replaced by get_feature_names_out().
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.feature_log_prob_[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))
show_top10(clf, vectorizer, newsgroups_train.target_names)
alt.atheism: edu it and in you that is of to the
comp.graphics: edu in graphics it is for and of to the
sci.space: edu it that is in and space to of the
talk.religion.misc: not it you in is that and to of the
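Those top features are almost all stopwords and header tokens ("edu", "the", "of") rather than topical words, a hint that the model leans on boilerplate; the remove parameter below strips it out.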
# The remove parameter strips out information; it takes any combination of
# 'headers', 'footers' and 'quotes'.
newsgroups_test_nh = fetch_20newsgroups(subset='test', remove=('headers',),
                                        categories=categories)
newsgroups_test_nf = fetch_20newsgroups(subset='test', remove=('footers',),
                                        categories=categories)
newsgroups_test_nq = fetch_20newsgroups(subset='test', remove=('quotes',),
                                        categories=categories)
newsgroups_test_all = fetch_20newsgroups(subset='test',
                                         remove=('headers', 'footers', 'quotes'),
                                         categories=categories)
vectors_test_nh = vectorizer.transform(newsgroups_test_nh.data)
vectors_test_nf = vectorizer.transform(newsgroups_test_nf.data)
vectors_test_nq = vectorizer.transform(newsgroups_test_nq.data)
vectors_test_all = vectorizer.transform(newsgroups_test_all.data)
pred_nh = clf.predict(vectors_test_nh)
pred_nf = clf.predict(vectors_test_nf)
pred_nq = clf.predict(vectors_test_nq)
pred_all = clf.predict(vectors_test_all)
print(metrics.f1_score(newsgroups_test_nh.target, pred_nh, average='macro'))
print(metrics.f1_score(newsgroups_test_nf.target, pred_nf, average='macro'))
print(metrics.f1_score(newsgroups_test_nq.target, pred_nq, average='macro'))
print(metrics.f1_score(newsgroups_test_all.target, pred_all, average='macro'))
0.870270086666271
0.8819288688799596
0.8401551094938573
0.7731035068127478
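Each drop measures how much of the original 0.882 macro-F1 came from metadata rather than message text: footers barely matter, quotes matter more, and with headers, footers and quotes all stripped the score falls to ~0.77.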
# Labeled Faces in the Wild (LFW)
# The loader downloads the archive to ~/scikit_learn_data/lfw_home/ using joblib,
# parses the metadata, decodes the jpegs, and converts the slices into memmapped
# numpy arrays. target is the id of the person in each picture; color=True loads
# the RGB channels instead of grayscale. fetch_lfw_pairs supports 'train', 'test'
# and '10_folds' evaluation subsets.
data = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
print(data.target_names.size,"\t",data.target_names[0])
print(data.data.dtype)
print(data.data.shape)
print(data.images.shape)
7 	 Ariel Sharon
float32
(1288, 1850)
(1288, 50, 37)
pairs = fetch_lfw_pairs(subset='train')
print(pairs.target_names)
print(pairs.pairs.shape)
print(pairs.data.shape)
print(pairs.target.shape)
['Different persons' 'Same person']
(2200, 2, 62, 47)
(2200, 5828)
(2200,)
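A hedged sketch of the color=True option mentioned above (another large download; the trailing channel axis is the expected shape per the docs, not verified here):
data_rgb = fetch_lfw_people(min_faces_per_person=70, resize=0.4, color=True)
print(data_rgb.images.shape)   # expected (1288, 50, 37, 3): RGB channels kept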
# Forest covertypes (581K samples, 54 attributes, 7 classes)
# fetch_covtype returns data (a dict-like 'Bunch' object) with the feature
# matrix and target; as_frame=True returns a pandas DataFrame instead.
data = fetch_covtype()
print(data.data.shape)
print(data.target.shape)
print(data.data[0])
(581012, 54)
(581012,)
[2.596e+03 5.100e+01 3.000e+00 2.580e+02 0.000e+00 5.100e+02 2.210e+02
 2.320e+02 1.480e+02 6.279e+03 1.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
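And the as_frame=True variant mentioned above; a sketch, assuming scikit-learn >= 0.24 (when fetch_covtype gained the parameter):
df = fetch_covtype(as_frame=True).frame
print(df.shape)   # expected (581012, 55): 54 features plus the target column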
# RCV1: Reuters Corpus Volume 1 text corpus
# data is a scipy sparse CSR matrix; the non-zero values are cosine-normalized
#   TF-IDF vectors. The first 23K samples are the training set, the last 781K
#   the test set; the array should have ~0.16% non-zero values.
# target is a scipy sparse CSR matrix: 804K samples x 103 categories, with a 1
#   in each sample's categories and a 0 in the others; ~3.1% non-zero.
# target_names are the topics of each sample (1 to 17 per sample); there are
#   103 topics (strings), with corpus frequencies from 5 ('GMIL') to 381K ('CCAT').
data = fetch_rcv1()
print(data.data.shape)
print(data.target.shape)
print(data.sample_id[:3])
print(data.target_names[:10].tolist())
(804414, 47236)
(804414, 103)
[2286 2287 2288]
['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16', 'C17']
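Reading a sample's topics out of the sparse indicator matrix takes one nonzero() call; a minimal sketch:
import numpy as np
topic_cols = data.target[0].nonzero()[1]          # column indices holding 1s
print(np.asarray(data.target_names)[topic_cols])  # that sample's topic labels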
# KDD Cup '99 network intrusion data
# The original dataset is ~80% abnormal traffic - unrealistic for anomaly
# detection - so it is split into the SA and SF subsets. SF keeps the samples
# whose logged_in attribute is positive, focusing on intrusion attacks (~0.3%);
# http and smtp are subsets of SF.
# Full set: 4.89M samples, 41 discrete/continuous attributes; smtp: 95K samples,
# 3 attributes (9,571 with the default percent10=True).
# Returns a data feature matrix and target values; as_frame=True returns data
# as a pandas DataFrame and target as a pandas Series.
data = fetch_kddcup99(subset='smtp')
print(data.data.shape)
print(data.target.shape)
print(data.data[0])
(9571, 3)
(9571,)
[-2.3025850929940455 8.12151008316269 5.796361655949294]
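The smtp subset is a standard anomaly-detection benchmark; a minimal sketch with IsolationForest, assuming the targets are byte strings such as b'normal.' (as fetch_kddcup99 returns them):
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
y_true = (data.target != b'normal.').astype(int)   # 1 = attack, 0 = normal
iso = IsolationForest(random_state=0).fit(data.data)
scores = -iso.score_samples(data.data)             # higher = more anomalous
print(roc_auc_score(y_true, scores))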
# California housing (regression) - 20.6K samples, 8 attributes
data = fetch_california_housing()
print(data.data.shape)
print(data.target.shape)
print(data.data[0])
(20640, 8)
(20640,)
[ 8.3252 41. 6.98412698 1.02380952 322. 2.55555556 37.88 -122.23 ]
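A quick end-to-end sketch to show the Bunch in use (a plain linear model typically lands around R^2 = 0.6 on this data, though the exact score depends on the split):
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, random_state=0)
print(LinearRegression().fit(X_tr, y_tr).score(X_te, y_te))   # R^2 on held-out data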