Abstract: This data set contains the distribution of words in the full text of the NIPS conference papers published from 1987 to 2015.

Data Set Information:

The dataset is in the form of a 11463 x 5812 matrix of word counts, containing 11463 words and 5811 NIPS conference papers (the first column contains the list of words). Each column contains the number of times each word appears in the corresponding document. The names of the columns give information about each document and its timestamp in the following format: Xyear_paperID.

The matrix of word counts was obtained using the R package 'tm” to process the raw .txt files of the full text of the NIPS conference papers published between 1987 and 2015. The document-term matrix was constructed after tokenization, removal of stopwords and truncation of the vocabulary by only keeping words occurring more than 50 times.

Attribute Information:

Column 1: 'X' (list of words) Columns 2-5812: 'Xyear_ID' (timestamp and paper ID)

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

Criando a transposta do dataset

In [2]:
dataset = pd.read_csv('NIPS_1987-2015.csv')
dataset = dataset.transpose()
new_header = dataset.iloc[0]
dataset = dataset.iloc[1:]
dataset.columns = new_header
dataset.to_csv('NIPS_1987-2015_transpose.csv')
In [3]:
dataset = pd.read_csv('NIPS_1987-2015_transpose.csv')
dataset.head(5)
Out[3]:
Unnamed: 0 abalone abbeel abbott abbreviate abbreviated abc abeles abernethy abilistic ... zhou zhu zien zilberstein zones zoo zoom zou zoubin zurich
0 1987_1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1987_2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1987_3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 1987_4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1987_5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 11464 columns

In [4]:
maxx = dataset.max()
maxx = maxx[1:] # Retirando a coluna Xyear_ID
In [13]:
n, bins, patches = plt.hist(maxx, 40, log=True, ec='black', alpha=0.75, align='mid')
plt.title('Histograma BoW', fontsize=20)
plt.xlabel('Maximo encontrado', fontsize=12)
plt.xticks(bins, rotation=90)
plt.ylabel('Frequência', fontsize=12)
ax = plt.gca()
ax.grid(axis='y', linestyle='--', linewidth=1)
plt.show()
print(n, bins)
[  2.32400000e+03   2.40300000e+03   1.28500000e+03   1.09700000e+03
   6.73000000e+02   6.58000000e+02   4.45000000e+02   5.01000000e+02
   3.09000000e+02   3.13000000e+02   3.00000000e+02   1.81000000e+02
   1.84000000e+02   1.29000000e+02   1.49000000e+02   7.40000000e+01
   8.60000000e+01   6.10000000e+01   6.50000000e+01   3.20000000e+01
   3.50000000e+01   3.80000000e+01   2.30000000e+01   2.70000000e+01
   1.00000000e+01   1.80000000e+01   6.00000000e+00   9.00000000e+00
   3.00000000e+00   6.00000000e+00   3.00000000e+00   3.00000000e+00
   4.00000000e+00   2.00000000e+00   3.00000000e+00   0.00000000e+00
   2.00000000e+00   0.00000000e+00   0.00000000e+00   2.00000000e+00] [   1.      5.55   10.1    14.65   19.2    23.75   28.3    32.85   37.4
   41.95   46.5    51.05   55.6    60.15   64.7    69.25   73.8    78.35
   82.9    87.45   92.     96.55  101.1   105.65  110.2   114.75  119.3
  123.85  128.4   132.95  137.5   142.05  146.6   151.15  155.7   160.25
  164.8   169.35  173.9   178.45  183.  ]
In [8]:
sorted_columns = maxx.sort_values(ascending=True)
drop_columns = []

for i in range(len(sorted_columns)):
    drop_columns.append(sorted_columns.axes[0][i])
    
    if sorted_columns[i] > 10:
        break
        
len(drop_columns)
Out[8]:
4728
In [9]:
dataset_old = dataset.copy()
dataset = dataset.drop(drop_columns, axis=1)
In [25]:
from sklearn.decomposition import PCA

X = dataset
X = X.iloc[:, 1:]

pca = PCA(n_components=2).fit(X)
X_PCA = pca.transform(X)
plt.scatter(X_PCA[:,0], X_PCA[:,1])
plt.show()  
In [15]:
pca.explained_variance_ratio_
Out[15]:
array([ 0.03544668,  0.03199579])
In [16]:
from sklearn.decomposition import SparsePCA

X = dataset
X = X.iloc[:, 1:]

spca = SparsePCA(n_components=2).fit(X)
X_SPCA = spca.transform(X)
plt.scatter(X_SPCA[:, 0], X_SPCA[:, 1])
plt.show()
In [54]:
from sklearn.manifold import TSNE

X = dataset
X = X.iloc[:, 1:]
X_tsne = TSNE(n_components=2).fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.show()
Out[54]:
<matplotlib.collections.PathCollection at 0x10afec5f8>
In [55]:
plt.show()
In [13]:
from sklearn.manifold import MDS

X = dataset
X = X.iloc[:, 1:]

X_mds = MDS(n_components=2, verbose=2).fit_transform(X)
plt.scatter(X_mds[:, 0], X_mds[:, 1])
plt.show()
it: 0, stress 531243717615.0
it: 1, stress 97962839765.7
it: 2, stress 88056978464.8
it: 3, stress 82050324014.8
it: 4, stress 78293153859.0
it: 5, stress 75875011232.9
it: 6, stress 74310753079.3
it: 7, stress 73304962083.7
it: 8, stress 72645486264.3
it: 9, stress 72203108110.2
it: 10, stress 71898046782.0
it: 11, stress 71683820500.5
it: 12, stress 71531172134.5
it: 13, stress 71421763307.6
it: 14, stress 71343443922.7
it: 15, stress 71287244376.1
it: 16, stress 71246457615.8
it: 17, stress 71216115876.2
it: 18, stress 71192998243.3
it: 19, stress 71174907907.4
it: 20, stress 71160396585.0
it: 21, stress 71148598673.0
it: 22, stress 71138916145.0
it: 23, stress 71130879736.8
it: 24, stress 71124201542.2
it: 25, stress 71118637224.0
it: 26, stress 71113993514.9
it: 27, stress 71110119405.6
it: 28, stress 71106917314.9
it: 29, stress 71104250629.9
it: 30, stress 71102018174.6
it: 31, stress 71100145871.8
it: 32, stress 71098568149.2
it: 33, stress 71097215013.1
it: 34, stress 71096043310.5
it: 35, stress 71095021746.7
it: 36, stress 71094114726.8
it: 37, stress 71093298385.4
it: 38, stress 71092552288.0
it: 39, stress 71091863475.2
it: 40, stress 71091220543.2
it: 41, stress 71090615643.3
it: 42, stress 71090042452.9
it: 43, stress 71089494791.5
it: 44, stress 71088966927.9
it: 45, stress 71088455568.6
it: 46, stress 71087958105.7
it: 47, stress 71087471687.1
it: 48, stress 71086994169.2
it: 49, stress 71086523955.7
it: 50, stress 71086059607.5
it: 51, stress 71085599242.1
it: 52, stress 71085141230.9
it: 53, stress 71084684726.3
it: 54, stress 71084228623.0
it: 55, stress 71083772886.7
it: 56, stress 71083316693.7
it: 57, stress 71082859106.4
it: 58, stress 71082398857.7
it: 59, stress 71081935709.2
it: 60, stress 71081469060.3
it: 61, stress 71080998242.1
it: 62, stress 71080523416.1
it: 63, stress 71080044371.0
it: 64, stress 71079560953.8
it: 65, stress 71079072922.4
it: 66, stress 71078580032.6
it: 67, stress 71078082535.8
it: 68, stress 71077580292.0
it: 69, stress 71077072805.0
it: 70, stress 71076559784.9
it: 71, stress 71076041038.9
it: 72, stress 71075515275.7
it: 73, stress 71074982476.9
it: 74, stress 71074443349.0
it: 75, stress 71073897763.1
it: 76, stress 71073345551.5
it: 77, stress 71072786680.9
it: 78, stress 71072221068.8
it: 79, stress 71071648511.9
it: 80, stress 71071069059.6
it: 81, stress 71070481994.2
it: 82, stress 71069886546.8
it: 83, stress 71069282064.3
it: 84, stress 71068667588.5
it: 85, stress 71068042299.1
it: 86, stress 71067406391.6
it: 87, stress 71066760277.9
it: 88, stress 71066103912.7
it: 89, stress 71065437454.3
it: 90, stress 71064761102.3
it: 91, stress 71064073943.3
it: 92, stress 71063375878.3
it: 93, stress 71062667183.6
it: 94, stress 71061947600.7
it: 95, stress 71061217554.4
it: 96, stress 71060476502.1
it: 97, stress 71059723807.6
it: 98, stress 71058958728.5
it: 99, stress 71058179955.2
it: 100, stress 71057386935.5
it: 101, stress 71056579771.5
it: 102, stress 71055758478.0
it: 103, stress 71054922563.2
it: 104, stress 71054071370.5
it: 105, stress 71053204049.8
it: 106, stress 71052320077.0
it: 107, stress 71051419945.4
it: 108, stress 71050504437.0
it: 109, stress 71049573611.7
it: 110, stress 71048626185.0
it: 111, stress 71047661571.4
it: 112, stress 71046679289.8
it: 113, stress 71045679008.7
it: 114, stress 71044660828.5
it: 115, stress 71043625231.7
it: 116, stress 71042571803.7
it: 117, stress 71041497466.3
it: 118, stress 71040400611.0
it: 119, stress 71039282635.0
it: 120, stress 71038143984.3
it: 121, stress 71036982209.7
it: 122, stress 71035795950.4
it: 123, stress 71034587622.4
it: 124, stress 71033356618.3
it: 125, stress 71032103647.5
it: 126, stress 71030829080.7
it: 127, stress 71029532397.8
it: 128, stress 71028214842.7
it: 129, stress 71026874788.8
it: 130, stress 71025511250.2
it: 131, stress 71024124908.7
it: 132, stress 71022714629.5
it: 133, stress 71021279642.1
it: 134, stress 71019819226.5
it: 135, stress 71018335029.3
it: 136, stress 71016826573.5
it: 137, stress 71015290817.1
it: 138, stress 71013729605.7
it: 139, stress 71012143703.1
it: 140, stress 71010531884.7
it: 141, stress 71008896672.1
it: 142, stress 71007236876.8
it: 143, stress 71005552873.7
it: 144, stress 71003843004.1
it: 145, stress 71002105641.0
it: 146, stress 71000340447.8
it: 147, stress 70998544924.1
it: 148, stress 70996720216.6
it: 149, stress 70994864905.8
it: 150, stress 70992977835.2
it: 151, stress 70991057083.1
it: 152, stress 70989105047.8
it: 153, stress 70987122130.6
it: 154, stress 70985105894.0
it: 155, stress 70983055335.8
it: 156, stress 70980969098.6
it: 157, stress 70978847721.8
it: 158, stress 70976691194.3
it: 159, stress 70974497000.2
it: 160, stress 70972264055.9
it: 161, stress 70969991100.1
it: 162, stress 70967679503.4
it: 163, stress 70965327653.5
it: 164, stress 70962936421.0
it: 165, stress 70960508579.0
it: 166, stress 70958043474.8
it: 167, stress 70955540233.1
it: 168, stress 70952995786.6
it: 169, stress 70950409560.3
it: 170, stress 70947781789.0
it: 171, stress 70945111499.6
it: 172, stress 70942392815.5
it: 173, stress 70939624133.9
it: 174, stress 70936806194.2
it: 175, stress 70933942783.2
it: 176, stress 70931029092.2
it: 177, stress 70928069643.6
it: 178, stress 70925065315.8
it: 179, stress 70922017788.5
it: 180, stress 70918926091.1
it: 181, stress 70915785812.8
it: 182, stress 70912600007.2
it: 183, stress 70909367829.5
it: 184, stress 70906087290.2
it: 185, stress 70902755356.3
it: 186, stress 70899379233.7
it: 187, stress 70895958728.2
it: 188, stress 70892492605.1
it: 189, stress 70888979529.4
it: 190, stress 70885416265.2
it: 191, stress 70881796236.1
it: 192, stress 70878123185.0
it: 193, stress 70874392207.2
it: 194, stress 70870606579.5
it: 195, stress 70866765106.0
it: 196, stress 70862866998.7
it: 197, stress 70858913738.0
it: 198, stress 70854908108.2
it: 199, stress 70850848746.9
it: 200, stress 70846733259.5
it: 201, stress 70842565188.5
it: 202, stress 70838342127.9
it: 203, stress 70834059226.5
it: 204, stress 70829722409.9
it: 205, stress 70825324894.2
it: 206, stress 70820869769.6
it: 207, stress 70816357904.7
it: 208, stress 70811790412.4
it: 209, stress 70807165302.3
it: 210, stress 70802476790.6
it: 211, stress 70797728419.5
it: 212, stress 70792928224.9
it: 213, stress 70788072922.8
it: 214, stress 70783165414.8
it: 215, stress 70778208125.8
it: 216, stress 70773195541.0
it: 217, stress 70768125145.9
it: 218, stress 70762989030.1
it: 219, stress 70757797913.5
it: 220, stress 70752559430.4
it: 221, stress 70747273212.9
it: 222, stress 70741927024.2
it: 223, stress 70736521465.5
it: 224, stress 70731054371.1
it: 225, stress 70725520742.1
it: 226, stress 70719916350.4
it: 227, stress 70714233195.7
it: 228, stress 70708476071.9
it: 229, stress 70702657041.3
it: 230, stress 70696781387.4
it: 231, stress 70690849804.9
it: 232, stress 70684861084.3
it: 233, stress 70678806649.3
it: 234, stress 70672699024.7
it: 235, stress 70666526341.8
it: 236, stress 70660302506.2
it: 237, stress 70654026940.3
it: 238, stress 70647696741.8
it: 239, stress 70641309379.6
it: 240, stress 70634873073.4
it: 241, stress 70628390877.0
it: 242, stress 70621859211.0
it: 243, stress 70615273466.1
it: 244, stress 70608641785.5
it: 245, stress 70601969797.8
it: 246, stress 70595248648.6
it: 247, stress 70588476466.0
it: 248, stress 70581646726.4
it: 249, stress 70574754080.2
it: 250, stress 70567803001.9
it: 251, stress 70560796609.5
it: 252, stress 70553733969.2
it: 253, stress 70546620732.7
it: 254, stress 70539451010.1
it: 255, stress 70532218697.4
it: 256, stress 70524925904.6
it: 257, stress 70517577120.1
it: 258, stress 70510184782.3
it: 259, stress 70502743791.5
it: 260, stress 70495262626.3
it: 261, stress 70487740969.0
it: 262, stress 70480177476.4
it: 263, stress 70472551522.4
it: 264, stress 70464879982.3
it: 265, stress 70457160432.9
it: 266, stress 70449391780.6
it: 267, stress 70441575180.3
it: 268, stress 70433716938.9
it: 269, stress 70425818497.2
it: 270, stress 70417866070.1
it: 271, stress 70409850703.1
it: 272, stress 70401779342.3
it: 273, stress 70393647186.0
it: 274, stress 70385465146.0
it: 275, stress 70377230730.7
it: 276, stress 70368949594.2
it: 277, stress 70360628055.5
it: 278, stress 70352257948.2
it: 279, stress 70343821670.5
it: 280, stress 70335325034.4
it: 281, stress 70326773069.8
it: 282, stress 70318176395.5
it: 283, stress 70309539710.9
it: 284, stress 70300849656.1
it: 285, stress 70292117455.7
it: 286, stress 70283354144.1
it: 287, stress 70274559804.7
it: 288, stress 70265736850.4
it: 289, stress 70256866501.1
it: 290, stress 70247961018.9
it: 291, stress 70239027079.1
it: 292, stress 70230067780.8
it: 293, stress 70221069363.1
it: 294, stress 70212032807.0
it: 295, stress 70202958759.4
it: 296, stress 70193836141.1
it: 297, stress 70184662264.2
it: 298, stress 70175429610.7
it: 299, stress 70166148454.1
it: 0, stress 531247381558.0
it: 1, stress 98298427768.5
it: 2, stress 88308179200.4
it: 3, stress 82189728271.2
it: 4, stress 78437516421.0
it: 5, stress 76001806557.5
it: 6, stress 74406557826.4
it: 7, stress 73354298741.9
it: 8, stress 72664176821.4
it: 9, stress 72206200617.0
it: 10, stress 71889215493.6
it: 11, stress 71666697197.9
it: 12, stress 71512039796.7
it: 13, stress 71405251210.9
it: 14, stress 71330060996.9
it: 15, stress 71275894862.1
it: 16, stress 71236374054.1
it: 17, stress 71207220863.5
it: 18, stress 71185540731.8
it: 19, stress 71169200895.0
it: 20, stress 71156709862.0
it: 21, stress 71146992800.4
it: 22, stress 71139282657.0
it: 23, stress 71133067213.1
it: 24, stress 71128017181.7
it: 25, stress 71123836086.0
it: 26, stress 71120315617.0
it: 27, stress 71117300199.4
it: 28, stress 71114706808.8
it: 29, stress 71112460626.4
it: 30, stress 71110504974.2
it: 31, stress 71108804421.4
it: 32, stress 71107320972.0
it: 33, stress 71106019256.4
it: 34, stress 71104869097.3
it: 35, stress 71103853895.7
it: 36, stress 71102951407.3
it: 37, stress 71102146373.4
it: 38, stress 71101425727.8
it: 39, stress 71100779896.0
it: 40, stress 71100195407.5
it: 41, stress 71099663854.1
it: 42, stress 71099175519.4
it: 43, stress 71098723681.8
it: 44, stress 71098303034.2
it: 45, stress 71097909979.0
it: 46, stress 71097539728.9
it: 47, stress 71097189957.7
it: 48, stress 71096857396.0
it: 49, stress 71096538732.6
it: 50, stress 71096232642.3
it: 51, stress 71095937637.9
it: 52, stress 71095652286.6
it: 53, stress 71095375360.3
it: 54, stress 71095105979.3
it: 55, stress 71094843288.9
it: 56, stress 71094586537.4
it: 57, stress 71094335072.8
it: 58, stress 71094088195.1
it: 59, stress 71093845729.2
it: 60, stress 71093607315.0
it: 61, stress 71093372515.1
it: 62, stress 71093140923.8
it: 63, stress 71092912109.2
it: 64, stress 71092685643.6
it: 65, stress 71092461251.4
it: 66, stress 71092238539.6
it: 67, stress 71092017325.9
it: 68, stress 71091797513.1
it: 69, stress 71091579021.6
it: 70, stress 71091361697.9
it: 71, stress 71091145389.1
it: 72, stress 71090930002.4
it: 73, stress 71090715412.9
it: 74, stress 71090501776.0
it: 75, stress 71090289031.7
it: 76, stress 71090077042.7
it: 77, stress 71089865662.6
it: 78, stress 71089654783.4
it: 79, stress 71089444281.6
it: 80, stress 71089234167.9
it: 81, stress 71089024483.1
it: 82, stress 71088815206.2
it: 83, stress 71088606240.5
it: 84, stress 71088397392.8
it: 85, stress 71088188679.3
it: 86, stress 71087980052.3
it: 87, stress 71087771330.2
it: 88, stress 71087562691.8
it: 89, stress 71087354208.2
it: 90, stress 71087145891.4
it: 91, stress 71086937680.2
it: 92, stress 71086729466.5
it: 93, stress 71086521054.3
it: 94, stress 71086312166.9
it: 95, stress 71086102942.7
it: 96, stress 71085893414.8
it: 97, stress 71085683442.5
it: 98, stress 71085472829.3
it: 99, stress 71085261518.2
it: 100, stress 71085049524.2
it: 101, stress 71084836759.6
it: 102, stress 71084623007.7
it: 103, stress 71084408172.3
it: 104, stress 71084192098.4
it: 105, stress 71083974933.3
it: 106, stress 71083756594.8
it: 107, stress 71083537036.3
it: 108, stress 71083316244.0
it: 109, stress 71083094204.1
it: 110, stress 71082870790.5
it: 111, stress 71082645785.9
it: 112, stress 71082419165.7
it: 113, stress 71082191041.1
it: 114, stress 71081961409.8
it: 115, stress 71081730238.0
it: 116, stress 71081497756.4
it: 117, stress 71081263960.8
it: 118, stress 71081028698.9
it: 119, stress 71080791622.6
it: 120, stress 71080552696.1
it: 121, stress 71080311869.9
it: 122, stress 71080069084.1
it: 123, stress 71079824675.6
it: 124, stress 71079578721.8
it: 125, stress 71079331170.2
it: 126, stress 71079082062.3
it: 127, stress 71078831322.5
it: 128, stress 71078578980.0
it: 129, stress 71078325243.8
it: 130, stress 71078070187.6
it: 131, stress 71077813878.0
it: 132, stress 71077556154.4
it: 133, stress 71077296757.4
it: 134, stress 71077035801.6
it: 135, stress 71076772969.0
it: 136, stress 71076507970.8
it: 137, stress 71076240739.1
it: 138, stress 71075971472.0
it: 139, stress 71075699933.9
it: 140, stress 71075426240.5
it: 141, stress 71075150594.3
it: 142, stress 71074872887.9
it: 143, stress 71074592748.5
it: 144, stress 71074310396.3
it: 145, stress 71074025578.3
it: 146, stress 71073738206.9
it: 147, stress 71073448742.1
it: 148, stress 71073157064.2
it: 149, stress 71072863196.7
it: 150, stress 71072566987.5
it: 151, stress 71072268259.9
it: 152, stress 71071966884.5
it: 153, stress 71071662373.0
it: 154, stress 71071355186.5
it: 155, stress 71071045272.1
it: 156, stress 71070732920.6
it: 157, stress 71070417903.7
it: 158, stress 71070100387.6
it: 159, stress 71069780350.0
it: 160, stress 71069457207.0
it: 161, stress 71069130667.6
it: 162, stress 71068801039.1
it: 163, stress 71068468181.6
it: 164, stress 71068131878.2
it: 165, stress 71067791963.7
it: 166, stress 71067448502.4
it: 167, stress 71067101528.7
it: 168, stress 71066750842.3
it: 169, stress 71066396468.3
it: 170, stress 71066039010.3
it: 171, stress 71065678687.1
it: 172, stress 71065315168.8
it: 173, stress 71064948145.9
it: 174, stress 71064577504.2
it: 175, stress 71064203284.9
it: 176, stress 71063825384.8
it: 177, stress 71063443691.6
it: 178, stress 71063058350.8
it: 179, stress 71062669319.8
it: 180, stress 71062276650.9
it: 181, stress 71061880272.5
it: 182, stress 71061479969.2
it: 183, stress 71061075670.9
it: 184, stress 71060667301.0
it: 185, stress 71060254961.6
it: 186, stress 71059838646.9
it: 187, stress 71059418113.4
it: 188, stress 71058992897.2
it: 189, stress 71058563499.2
it: 190, stress 71058129662.4
it: 191, stress 71057691045.3
it: 192, stress 71057247250.7
it: 193, stress 71056799006.2
it: 194, stress 71056346309.7
it: 195, stress 71055889228.1
it: 196, stress 71055427605.5
it: 197, stress 71054960914.8
it: 198, stress 71054489054.3
it: 199, stress 71054012232.0
it: 200, stress 71053530074.5
it: 201, stress 71053042545.4
it: 202, stress 71052549632.4
it: 203, stress 71052051207.8
it: 204, stress 71051547094.9
it: 205, stress 71051037107.8
it: 206, stress 71050521292.3
it: 207, stress 71049999921.1
it: 208, stress 71049472801.8
it: 209, stress 71048939451.2
it: 210, stress 71048399783.8
it: 211, stress 71047853541.0
it: 212, stress 71047300496.6
it: 213, stress 71046740428.5
it: 214, stress 71046173253.0
it: 215, stress 71045599051.3
it: 216, stress 71045017987.7
it: 217, stress 71044429803.8
it: 218, stress 71043833431.1
it: 219, stress 71043229487.8
it: 220, stress 71042618596.6
it: 221, stress 71042001076.1
it: 222, stress 71041374915.3
it: 223, stress 71040739941.2
it: 224, stress 71040097017.1
it: 225, stress 71039445996.2
it: 226, stress 71038787359.3
it: 227, stress 71038121355.1
it: 228, stress 71037447772.9
it: 229, stress 71036767063.9
it: 230, stress 71036079567.1
it: 231, stress 71035384013.0
it: 232, stress 71034680119.2
it: 233, stress 71033968432.7
it: 234, stress 71033248631.4
it: 235, stress 71032520357.8
it: 236, stress 71031784452.5
it: 237, stress 71031040525.7
it: 238, stress 71030288127.8
it: 239, stress 71029527437.8
it: 240, stress 71028758374.9
it: 241, stress 71027979589.3
it: 242, stress 71027190572.6
it: 243, stress 71026392454.3
it: 244, stress 71025585145.6
it: 245, stress 71024768722.1
it: 246, stress 71023943352.1
it: 247, stress 71023108509.9
it: 248, stress 71022262335.6
it: 249, stress 71021404563.4
it: 250, stress 71020535360.8
it: 251, stress 71019656149.6
it: 252, stress 71018766210.4
it: 253, stress 71017864205.7
it: 254, stress 71016951640.2
it: 255, stress 71016028894.3
it: 256, stress 71015094648.1
it: 257, stress 71014148648.5
it: 258, stress 71013190225.2
it: 259, stress 71012218604.6
it: 260, stress 71011234573.9
it: 261, stress 71010237409.7
it: 262, stress 71009226393.4
it: 263, stress 71008200986.6
it: 264, stress 71007161390.7
it: 265, stress 71006107791.7
it: 266, stress 71005039207.0
it: 267, stress 71003954369.4
it: 268, stress 71002854510.6
it: 269, stress 71001740890.3
it: 270, stress 71000612400.1
it: 271, stress 70999469044.6
it: 272, stress 70998309766.8
it: 273, stress 70997135898.6
it: 274, stress 70995947534.1
it: 275, stress 70994745242.7
it: 276, stress 70993528161.1
it: 277, stress 70992295691.3
it: 278, stress 70991047880.4
it: 279, stress 70989784266.4
it: 280, stress 70988504865.6
it: 281, stress 70987209583.1
it: 282, stress 70985897592.6
it: 283, stress 70984567047.6
it: 284, stress 70983217939.7
it: 285, stress 70981849471.6
it: 286, stress 70980463155.5
it: 287, stress 70979059490.6
it: 288, stress 70977637888.2
it: 289, stress 70976196230.1
it: 290, stress 70974736877.9
it: 291, stress 70973261561.1
it: 292, stress 70971767971.1
it: 293, stress 70970254024.2
it: 294, stress 70968720058.7
it: 295, stress 70967165223.1
it: 296, stress 70965588581.5
it: 297, stress 70963991835.5
it: 298, stress 70962374364.7
it: 299, stress 70960736746.6
it: 0, stress 531227342085.0
it: 1, stress 97638040944.2
it: 2, stress 87709422372.2
it: 3, stress 81634556395.4
it: 4, stress 77910179247.5
it: 5, stress 75617790158.5
it: 6, stress 74182901922.1
it: 7, stress 73267783621.7
it: 8, stress 72659636051.6
it: 9, stress 72233584005.7
it: 10, stress 71925265746.9
it: 11, stress 71701039779.4
it: 12, stress 71539891096.0
it: 13, stress 71425125642.9
it: 14, stress 71343392457.7
it: 15, stress 71284863319.0
it: 16, stress 71242248232.5
it: 17, stress 71210518764.0
it: 18, stress 71186367954.7
it: 19, stress 71167689893.7
it: 20, stress 71153073381.7
it: 21, stress 71141638944.4
it: 22, stress 71132707934.3
it: 23, stress 71125728004.7
it: 24, stress 71120273803.3
it: 25, stress 71115986179.2
it: 26, stress 71112556305.2
it: 27, stress 71109752582.1
it: 28, stress 71107423381.9
it: 29, stress 71105458035.4
it: 30, stress 71103769086.9
it: 31, stress 71102291196.9
it: 32, stress 71100982897.4
it: 33, stress 71099818149.9
it: 34, stress 71098774137.5
it: 35, stress 71097831170.6
it: 36, stress 71096977614.9
it: 37, stress 71096202336.6
it: 38, stress 71095494611.8
it: 39, stress 71094844659.0
it: 40, stress 71094246548.6
it: 41, stress 71093694648.3
it: 42, stress 71093184669.1
it: 43, stress 71092709989.2
it: 44, stress 71092266411.1
it: 45, stress 71091849666.1
it: 46, stress 71091456169.5
it: 47, stress 71091082163.3
it: 48, stress 71090724883.0
it: 49, stress 71090381688.9
it: 50, stress 71090050178.9
it: 51, stress 71089729283.6
it: 52, stress 71089417297.2
it: 53, stress 71089111955.9
it: 54, stress 71088812711.2
it: 55, stress 71088518475.1
it: 56, stress 71088228323.0
it: 57, stress 71087942001.9
it: 58, stress 71087658937.5
it: 59, stress 71087378813.1
it: 60, stress 71087101100.0
it: 61, stress 71086825262.4
it: 62, stress 71086550919.0
it: 63, stress 71086277676.1
it: 64, stress 71086005160.9
it: 65, stress 71085733154.8
it: 66, stress 71085461420.5
it: 67, stress 71085189710.7
it: 68, stress 71084917709.3
it: 69, stress 71084645005.7
it: 70, stress 71084371617.5
it: 71, stress 71084097665.3
it: 72, stress 71083822944.7
it: 73, stress 71083547162.3
it: 74, stress 71083270247.6
it: 75, stress 71082992115.1
it: 76, stress 71082712639.0
it: 77, stress 71082431775.4
it: 78, stress 71082149454.4
it: 79, stress 71081865217.1
it: 80, stress 71081578821.6
it: 81, stress 71081290355.5
it: 82, stress 71080999902.3
it: 83, stress 71080707524.5
it: 84, stress 71080412949.5
it: 85, stress 71080116118.4
it: 86, stress 71079816728.6
it: 87, stress 71079514959.4
it: 88, stress 71079210973.9
it: 89, stress 71078904680.3
it: 90, stress 71078596205.3
it: 91, stress 71078285489.3
it: 92, stress 71077972290.9
it: 93, stress 71077656439.3
it: 94, stress 71077338060.5
it: 95, stress 71077017165.0
it: 96, stress 71076693444.9
it: 97, stress 71076366881.3
it: 98, stress 71076037577.2
it: 99, stress 71075705369.4
it: 100, stress 71075370145.2
it: 101, stress 71075031872.0
it: 102, stress 71074690313.8
it: 103, stress 71074345282.1
it: 104, stress 71073996697.6
it: 105, stress 71073644344.9
it: 106, stress 71073287943.9
it: 107, stress 71072927684.8
it: 108, stress 71072563764.4
it: 109, stress 71072195805.7
it: 110, stress 71071823664.8
it: 111, stress 71071447648.8
it: 112, stress 71071067873.1
it: 113, stress 71070684556.1
it: 114, stress 71070297721.3
it: 115, stress 71069907418.4
it: 116, stress 71069512956.3
it: 117, stress 71069114151.3
it: 118, stress 71068710810.9
it: 119, stress 71068302673.0
it: 120, stress 71067889585.1
it: 121, stress 71067471548.8
it: 122, stress 71067048804.2
it: 123, stress 71066621637.9
it: 124, stress 71066189936.1
it: 125, stress 71065752863.9
it: 126, stress 71065310667.8
it: 127, stress 71064863274.1
it: 128, stress 71064410396.8
it: 129, stress 71063951904.7
it: 130, stress 71063488072.0
it: 131, stress 71063017821.8
it: 132, stress 71062540290.2
it: 133, stress 71062055799.6
it: 134, stress 71061564940.7
it: 135, stress 71061067857.7
it: 136, stress 71060564818.6
it: 137, stress 71060055359.8
it: 138, stress 71059539101.5
it: 139, stress 71059015687.0
it: 140, stress 71058485680.0
it: 141, stress 71057949303.6
it: 142, stress 71057406620.8
it: 143, stress 71056857469.6
it: 144, stress 71056301518.4
it: 145, stress 71055738501.0
it: 146, stress 71055167941.3
it: 147, stress 71054589566.0
it: 148, stress 71054003567.4
it: 149, stress 71053409766.6
it: 150, stress 71052809287.2
it: 151, stress 71052201671.4
it: 152, stress 71051586549.1
it: 153, stress 71050964534.2
it: 154, stress 71050335109.5
it: 155, stress 71049697619.5
it: 156, stress 71049051039.0
it: 157, stress 71048395607.0
it: 158, stress 71047731598.2
it: 159, stress 71047059085.5
it: 160, stress 71046377787.1
it: 161, stress 71045687062.4
it: 162, stress 71044986357.1
it: 163, stress 71044275663.8
it: 164, stress 71043554425.1
it: 165, stress 71042822138.8
it: 166, stress 71042079028.5
it: 167, stress 71041324434.7
it: 168, stress 71040558313.9
it: 169, stress 71039780258.2
it: 170, stress 71038990801.6
it: 171, stress 71038189874.9
it: 172, stress 71037379014.6
it: 173, stress 71036558113.3
it: 174, stress 71035726420.0
it: 175, stress 71034883669.6
it: 176, stress 71034029267.0
it: 177, stress 71033162394.2
it: 178, stress 71032283945.9
it: 179, stress 71031395280.4
it: 180, stress 71030495196.0
it: 181, stress 71029584609.9
it: 182, stress 71028663881.8
it: 183, stress 71027734299.4
it: 184, stress 71026795093.2
it: 185, stress 71025845821.1
it: 186, stress 71024884874.6
it: 187, stress 71023910939.1
it: 188, stress 71022922921.5
it: 189, stress 71021922081.1
it: 190, stress 71020908313.7
it: 191, stress 71019879323.4
it: 192, stress 71018835206.2
it: 193, stress 71017775480.0
it: 194, stress 71016701219.6
it: 195, stress 71015612322.4
it: 196, stress 71014508329.9
it: 197, stress 71013387190.2
it: 198, stress 71012248189.4
it: 199, stress 71011091454.7
it: 200, stress 71009916881.7
it: 201, stress 71008725616.5
it: 202, stress 71007516848.8
it: 203, stress 71006288570.0
it: 204, stress 71005041520.8
it: 205, stress 71003776019.6
it: 206, stress 71002494786.0
it: 207, stress 71001196718.2
it: 208, stress 70999879991.1
it: 209, stress 70998546436.6
it: 210, stress 70997195010.5
it: 211, stress 70995825454.8
it: 212, stress 70994436484.9
it: 213, stress 70993027783.3
it: 214, stress 70991598875.0
it: 215, stress 70990148778.6
it: 216, stress 70988676216.6
it: 217, stress 70987182407.3
it: 218, stress 70985665699.0
it: 219, stress 70984128144.5
it: 220, stress 70982569627.6
it: 221, stress 70980989279.7
it: 222, stress 70979386804.8
it: 223, stress 70977762099.6
it: 224, stress 70976114809.6
it: 225, stress 70974443541.7
it: 226, stress 70972749460.6
it: 227, stress 70971029419.9
it: 228, stress 70969284622.4
it: 229, stress 70967517193.6
it: 230, stress 70965728104.2
it: 231, stress 70963916102.2
it: 232, stress 70962080358.0
it: 233, stress 70960218039.9
it: 234, stress 70958329535.6
it: 235, stress 70956414376.6
it: 236, stress 70954473882.7
it: 237, stress 70952507393.3
it: 238, stress 70950513425.9
it: 239, stress 70948494542.8
it: 240, stress 70946451868.1
it: 241, stress 70944384367.5
it: 242, stress 70942293171.1
it: 243, stress 70940178358.4
it: 244, stress 70938037676.0
it: 245, stress 70935870059.0
it: 246, stress 70933672672.2
it: 247, stress 70931444890.5
it: 248, stress 70929188428.5
it: 249, stress 70926905289.4
it: 250, stress 70924591890.2
it: 251, stress 70922244303.4
it: 252, stress 70919862205.2
it: 253, stress 70917445525.4
it: 254, stress 70914995321.3
it: 255, stress 70912514198.0
it: 256, stress 70910005571.4
it: 257, stress 70907466399.0
it: 258, stress 70904894191.9
it: 259, stress 70902287556.6
it: 260, stress 70899645265.6
it: 261, stress 70896970684.7
it: 262, stress 70894261468.8
it: 263, stress 70891518054.4
it: 264, stress 70888736817.2
it: 265, stress 70885915006.6
it: 266, stress 70883055421.0
it: 267, stress 70880157303.1
it: 268, stress 70877220640.0
it: 269, stress 70874246687.1
it: 270, stress 70871239912.7
it: 271, stress 70868198171.1
it: 272, stress 70865121243.6
it: 273, stress 70862004483.1
it: 274, stress 70858846392.9
it: 275, stress 70855647066.4
it: 276, stress 70852411805.3
it: 277, stress 70849141954.6
it: 278, stress 70845834591.7
it: 279, stress 70842490980.4
it: 280, stress 70839111638.6
it: 281, stress 70835697045.7
it: 282, stress 70832244058.6
it: 283, stress 70828744695.4
it: 284, stress 70825202109.3
it: 285, stress 70821611637.5
it: 286, stress 70817976440.2
it: 287, stress 70814298793.2
it: 288, stress 70810577658.0
it: 289, stress 70806811848.0
it: 290, stress 70802998728.2
it: 291, stress 70799136836.2
it: 292, stress 70795224227.9
it: 293, stress 70791258384.8
it: 294, stress 70787240164.5
it: 295, stress 70783169900.4
it: 296, stress 70779049712.9
it: 297, stress 70774882964.6
it: 298, stress 70770670267.1
it: 299, stress 70766412401.2
it: 0, stress 531244818648.0
it: 1, stress 98132531162.4
it: 2, stress 88119774442.7
it: 3, stress 81912158682.4
it: 4, stress 78039549027.2
it: 5, stress 75645004577.5
it: 6, stress 74124675038.5
it: 7, stress 73145394679.7
it: 8, stress 72515913327.3
it: 9, stress 72113875780.2
it: 10, stress 71847586312.7
it: 11, stress 71661495002.1
it: 12, stress 71526110800.9
it: 13, stress 71426069989.9
it: 14, stress 71351861694.6
it: 15, stress 71296617572.4
it: 16, stress 71255286537.3
it: 17, stress 71223762168.8
it: 18, stress 71199292580.9
it: 19, stress 71179949212.1
it: 20, stress 71164481782.2
it: 21, stress 71152072859.7
it: 22, stress 71142007186.6
it: 23, stress 71133764718.1
it: 24, stress 71126979471.2
it: 25, stress 71121353426.1
it: 26, stress 71116668432.8
it: 27, stress 71112772600.1
it: 28, stress 71109541457.8
it: 29, stress 71106851153.1
it: 30, stress 71104607031.1
it: 31, stress 71102739401.0
it: 32, stress 71101183172.3
it: 33, stress 71099875633.3
it: 34, stress 71098768435.5
it: 35, stress 71097816845.7
it: 36, stress 71096988332.7
it: 37, stress 71096257499.0
it: 38, stress 71095607002.8
it: 39, stress 71095021285.3
it: 40, stress 71094488716.3
it: 41, stress 71094000301.5
it: 42, stress 71093548933.1
it: 43, stress 71093128653.4
it: 44, stress 71092733925.8
it: 45, stress 71092360377.1
it: 46, stress 71092004221.2
it: 47, stress 71091663258.8
it: 48, stress 71091335169.2
it: 49, stress 71091017846.6
it: 50, stress 71090709939.7
it: 51, stress 71090409964.2
it: 52, stress 71090116595.6
it: 53, stress 71089828774.4
it: 54, stress 71089545733.6
it: 55, stress 71089266558.3
it: 56, stress 71088990524.4
it: 57, stress 71088717006.9
it: 58, stress 71088445420.4
it: 59, stress 71088175258.5
it: 60, stress 71087906066.2
it: 61, stress 71087637378.7
it: 62, stress 71087368955.1
it: 63, stress 71087100526.7
it: 64, stress 71086831827.6
it: 65, stress 71086562461.7
it: 66, stress 71086292087.8
it: 67, stress 71086020478.4
it: 68, stress 71085747386.0
it: 69, stress 71085472730.2
it: 70, stress 71085196624.4
it: 71, stress 71084918964.1
it: 72, stress 71084639254.0
it: 73, stress 71084357935.0
it: 74, stress 71084075053.5
it: 75, stress 71083790510.2
it: 76, stress 71083504019.5
it: 77, stress 71083215362.3
it: 78, stress 71082924512.6
it: 79, stress 71082631407.0
it: 80, stress 71082335499.9
it: 81, stress 71082036916.2
it: 82, stress 71081735484.5
it: 83, stress 71081431133.8
it: 84, stress 71081123535.7
it: 85, stress 71080812897.1
it: 86, stress 71080499089.1
it: 87, stress 71080181924.1
it: 88, stress 71079860936.0
it: 89, stress 71079535779.2
it: 90, stress 71079206728.0
it: 91, stress 71078873795.8
it: 92, stress 71078537037.2
it: 93, stress 71078196406.9
it: 94, stress 71077851331.3
it: 95, stress 71077501877.6
it: 96, stress 71077147883.1
it: 97, stress 71076789811.3
it: 98, stress 71076427875.8
it: 99, stress 71076061925.8
it: 100, stress 71075691846.6
it: 101, stress 71075317173.5
it: 102, stress 71074937846.9
it: 103, stress 71074553112.5
it: 104, stress 71074163087.1
it: 105, stress 71073767894.1
it: 106, stress 71073367383.7
it: 107, stress 71072961593.2
it: 108, stress 71072550529.4
it: 109, stress 71072134265.1
it: 110, stress 71071712656.9
it: 111, stress 71071285094.5
it: 112, stress 71070851559.1
it: 113, stress 71070412002.4
it: 114, stress 71069965862.3
it: 115, stress 71069512600.3
it: 116, stress 71069052094.2
it: 117, stress 71068584613.9
it: 118, stress 71068110489.5
it: 119, stress 71067629271.1
it: 120, stress 71067141013.6
it: 121, stress 71066645201.8
it: 122, stress 71066141791.4
it: 123, stress 71065630364.6
it: 124, stress 71065111188.8
it: 125, stress 71064584258.1
it: 126, stress 71064049485.7
it: 127, stress 71063507332.0
it: 128, stress 71062956998.1
it: 129, stress 71062397096.4
it: 130, stress 71061827448.5
it: 131, stress 71061248857.2
it: 132, stress 71060661844.1
it: 133, stress 71060066395.5
it: 134, stress 71059462072.6
it: 135, stress 71058849284.2
it: 136, stress 71058227167.3
it: 137, stress 71057595719.7
it: 138, stress 71056954944.5
it: 139, stress 71056304452.5
it: 140, stress 71055644487.5
it: 141, stress 71054975257.6
it: 142, stress 71054295790.4
it: 143, stress 71053605822.7
it: 144, stress 71052905116.4
it: 145, stress 71052193115.8
it: 146, stress 71051469712.6
it: 147, stress 71050733406.1
it: 148, stress 71049983169.4
it: 149, stress 71049218694.0
it: 150, stress 71048439873.6
it: 151, stress 71047645914.9
it: 152, stress 71046838035.2
it: 153, stress 71046017042.2
it: 154, stress 71045182747.6
it: 155, stress 71044334485.1
it: 156, stress 71043470813.0
it: 157, stress 71042593211.5
it: 158, stress 71041701958.5
it: 159, stress 71040796935.9
it: 160, stress 71039876791.0
it: 161, stress 71038939733.6
it: 162, stress 71037985279.9
it: 163, stress 71037015574.4
it: 164, stress 71036030270.7
it: 165, stress 71035028529.3
it: 166, stress 71034010937.7
it: 167, stress 71032978067.3
it: 168, stress 71031929932.0
it: 169, stress 71030867010.9
it: 170, stress 71029788508.5
it: 171, stress 71028693180.4
it: 172, stress 71027580162.9
it: 173, stress 71026448473.8
it: 174, stress 71025297422.7
it: 175, stress 71024127304.3
it: 176, stress 71022937446.0
it: 177, stress 71021727361.4
it: 178, stress 71020496464.9
it: 179, stress 71019246272.1
it: 180, stress 71017975752.1
it: 181, stress 71016685016.8
it: 182, stress 71015376053.9
it: 183, stress 71014048196.9
it: 184, stress 71012700469.4
it: 185, stress 71011332116.4
it: 186, stress 71009940105.6
it: 187, stress 71008522381.1
it: 188, stress 71007079451.9
it: 189, stress 71005612128.3
it: 190, stress 71004117697.1
it: 191, stress 71002597534.6
it: 192, stress 71001051762.9
it: 193, stress 70999479472.0
it: 194, stress 70997881174.3
it: 195, stress 70996258437.9
it: 196, stress 70994610582.0
it: 197, stress 70992933682.9
it: 198, stress 70991227553.7
it: 199, stress 70989491970.3
it: 200, stress 70987726558.7
it: 201, stress 70985933185.8
it: 202, stress 70984110941.8
it: 203, stress 70982258867.1
it: 204, stress 70980375659.2
it: 205, stress 70978460804.8
it: 206, stress 70976514567.1
it: 207, stress 70974535192.5
it: 208, stress 70972522222.8
it: 209, stress 70970475991.8
it: 210, stress 70968394655.3
it: 211, stress 70966274835.1
it: 212, stress 70964119559.2
it: 213, stress 70961929767.4
it: 214, stress 70959706090.1
it: 215, stress 70957449228.4
it: 216, stress 70955157947.6
it: 217, stress 70952829877.5
it: 218, stress 70950460424.5
it: 219, stress 70948045466.4
it: 220, stress 70945585894.0
it: 221, stress 70943081683.9
it: 222, stress 70940535260.5
it: 223, stress 70937941957.3
it: 224, stress 70935303491.2
it: 225, stress 70932620454.5
it: 226, stress 70929892391.3
it: 227, stress 70927117924.9
it: 228, stress 70924298185.9
it: 229, stress 70921432003.0
it: 230, stress 70918519600.5
it: 231, stress 70915561207.2
it: 232, stress 70912557300.3
it: 233, stress 70909506988.1
it: 234, stress 70906405121.5
it: 235, stress 70903252645.5
it: 236, stress 70900050173.6
it: 237, stress 70896799950.2
it: 238, stress 70893499135.4
it: 239, stress 70890143998.0
it: 240, stress 70886732706.9
it: 241, stress 70883268888.3
it: 242, stress 70879756418.7
it: 243, stress 70876194769.8
it: 244, stress 70872582574.4
it: 245, stress 70868911259.6
it: 246, stress 70865176459.6
it: 247, stress 70861375167.7
it: 248, stress 70857507885.6
it: 249, stress 70853581719.0
it: 250, stress 70849594188.0
it: 251, stress 70845548636.0
it: 252, stress 70841448936.5
it: 253, stress 70837295446.6
it: 254, stress 70833085171.1
it: 255, stress 70828816455.2
it: 256, stress 70824484769.2
it: 257, stress 70820090411.5
it: 258, stress 70815632250.5
it: 259, stress 70811107993.7
it: 260, stress 70806524696.9
it: 261, stress 70801879869.8
it: 262, stress 70797166076.1
it: 263, stress 70792385834.5
it: 264, stress 70787539781.0
it: 265, stress 70782623365.9
it: 266, stress 70777635314.6
it: 267, stress 70772572099.3
it: 268, stress 70767441051.0
it: 269, stress 70762241740.5
it: 270, stress 70756977805.3
it: 271, stress 70751656712.9
it: 272, stress 70746272339.5
it: 273, stress 70740810053.2
it: 274, stress 70735267323.0
it: 275, stress 70729644057.6
it: 276, stress 70723947487.8
it: 277, stress 70718176248.9
it: 278, stress 70712332016.5
it: 279, stress 70706412006.8
it: 280, stress 70700415640.9
it: 281, stress 70694339773.0
it: 282, stress 70688182725.7
it: 283, stress 70681950868.2
it: 284, stress 70675638732.1
it: 285, stress 70669249491.4
it: 286, stress 70662778447.3
it: 287, stress 70656232000.5
it: 288, stress 70649614446.6
it: 289, stress 70642917312.4
it: 290, stress 70636130266.6
it: 291, stress 70629252120.7
it: 292, stress 70622287916.2
it: 293, stress 70615241803.9
it: 294, stress 70608110083.7
it: 295, stress 70600887402.8
it: 296, stress 70593566543.2
it: 297, stress 70586155573.0
it: 298, stress 70578643640.3
it: 299, stress 70571021689.3
In [48]:
dataset_2015 = []
for row in range(5408, len(dataset_old.values)):
    dataset_2015.append(dataset_old.iloc[row, :])

dataset_2015 = pd.DataFrame(data=dataset_2015)
dataset_2015.shape
Out[48]:
(403, 11464)
In [42]:
from sklearn.decomposition import SparsePCA

spca = SparsePCA(n_components=2).fit(dataset_2015.iloc[:, 1:])
X_SPCA = spca.transform(dataset_2015.iloc[:, 1:])
plt.scatter(X_SPCA[:, 0], X_SPCA[:, 1])
plt.show()

Redução de Dimensionalidade por LSA

In [49]:
dataset_2015 = dataset_2015.drop('Unnamed: 0', axis=1)
dataset_2015.head()
Out[49]:
abalone abbeel abbott abbreviate abbreviated abc abeles abernethy abilistic abilities ... zhou zhu zien zilberstein zones zoo zoom zou zoubin zurich
5408 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
5409 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
5410 0 0 0 0 0 0 0 0 0 0 ... 0 4 0 0 0 0 0 0 1 0
5411 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5412 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 11463 columns

In [50]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from time import time

t0 = time()

svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(dataset_2015)

print('Done %fs' % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))
Done 0.199114s
Explained variance of the SVD step: 5%
In [56]:
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, method = 'ward'))
plt.title('Dendrograma')
plt.xlabel('PCA0')
plt.ylabel('PCA1')
plt.show()
In [57]:
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters = 3, affinity = 'euclidean',
                            linkage = 'ward')
y_hc = hc.fit_predict(X)

plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s = 100, c = 'red')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s = 100, c = 'green')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s = 100, c = 'cyan')
plt.show()
In [58]:
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300,
                   n_init = 10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.show()
In [59]:
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 300,
               n_init = 10)
y_kmeans = kmeans.fit_predict(X)

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'green')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'cyan')
plt.show()

Todo o dataset

In [10]:
y = dataset_old.iloc[:, 0]
dataset_old = dataset_old.drop(dataset_old.columns[0], axis = 1)
In [76]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from time import time

t0 = time()

svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(dataset_old)

print('Done %fs' % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))
Done 2.434013s
Explained variance of the SVD step: 5%
In [80]:
plt.scatter(X[:, 0], X[:, 1])
plt.show()
In [77]:
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300,
                   n_init = 10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.show()
In [78]:
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 300,
               n_init = 10)
y_kmeans = kmeans.fit_predict(X)

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'green')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'cyan')
plt.show()
In [79]:
dataset_old.shape
Out[79]:
(5811, 11462)
In [83]:
from sklearn.cluster import MiniBatchKMeans
wcss = []
for i in range(1, 11):
    km = MiniBatchKMeans(n_clusters=i, init='k-means++',
                             n_init=1, init_size=1000, batch_size=1000)
    km.fit(X)
    wcss.append(km.inertia_)

plt.plot(range(1, 11), wcss)
plt.show()
In [84]:
kmeans = MiniBatchKMeans(n_clusters=3, init='k-means++',
                             n_init=1, init_size=1000, batch_size=1000)
y_kmeans = kmeans.fit_predict(X)

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'green')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'cyan')
plt.show()
In [86]:
from sklearn.cluster import KMeans
wcss = []
for i in range(10, 30):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300,
                   n_init = 10)
    kmeans.fit(dataset_old)
    wcss.append(kmeans.inertia_)

plt.plot(range(10, 30), wcss)
plt.show()
In [11]:
from sklearn.cluster import KMeans
wcss = []
start = 1
end = 100
for i in range(start, end):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300,
                   n_init = 10, n_jobs = -1)
    kmeans.fit(dataset_old)
    wcss.append(kmeans.inertia_)

plt.plot(range(start, end), wcss)
plt.show()
In [12]:
from sklearn.cluster import KMeans
wcss = []
start = 1000
end = 1010
for i in range(start, end):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300,
                   n_init = 10, n_jobs = -1)
    kmeans.fit(dataset_old)
    wcss.append(kmeans.inertia_)

plt.plot(range(start, end), wcss)
plt.show()
In [14]:
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(dataset_old, method = 'ward'))
plt.title('Dendrograma')
plt.show()