# Baseline for the Tox21 data: fit one random forest per assay column and
# print the test-set ROC AUC for each.
#
# make sure numpy, scipy, pandas, sklearn are installed, otherwise run
# pip install numpy scipy pandas scikit-learn
import numpy as np
import pandas as pd
from scipy import io
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# load data
# Labels stay as DataFrames (one column per assay); the dense feature tables
# are reduced to plain ndarrays via `.values`.
y_tr = pd.read_csv('tox21_labels_train.csv.gz', index_col=0, compression="gzip")
y_te = pd.read_csv('tox21_labels_test.csv.gz', index_col=0, compression="gzip")
x_tr_dense = pd.read_csv('tox21_dense_train.csv.gz', index_col=0, compression="gzip").values
x_te_dense = pd.read_csv('tox21_dense_test.csv.gz', index_col=0, compression="gzip").values
# mmread returns COO data; convert to CSC so column selection below works.
x_tr_sparse = io.mmread('tox21_sparse_train.mtx.gz').tocsc()
x_te_sparse = io.mmread('tox21_sparse_test.mtx.gz').tocsc()

# filter out very sparse features
# Keep only sparse columns that are non-zero in more than 5% of the TRAINING
# rows; the same column mask is then applied to the test matrix.
# np.asarray(...) / .toarray() are used instead of the `.A` shorthand because
# `.A` is not available on every SciPy sparse container / version, whereas
# these spellings work for both sparse matrices and sparse arrays.
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0)).ravel() > 0.05
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

# Build a random forest model for all twelve assays
for target in y_tr.columns:
    # Non-finite labels (e.g. NaN) are dropped, so each assay is trained and
    # evaluated only on the rows that have a label for it.
    rows_tr = np.isfinite(y_tr[target]).values
    rows_te = np.isfinite(y_te[target]).values

    # NOTE: no random_state is passed, so the scores vary from run to run.
    rf = RandomForestClassifier(n_estimators=100, n_jobs=4)
    rf.fit(x_tr[rows_tr], y_tr[target][rows_tr])

    # Column 1 of predict_proba is used as the score for the AUC
    # (presumably the positive class of a 0/1 label -- confirm against data).
    p_te = rf.predict_proba(x_te[rows_te])
    auc_te = roc_auc_score(y_te[target][rows_te], p_te[:, 1])
    print("%15s: %3.5f" % (target, auc_te))