Train and test an NMF model for a type of 1D monitoring element

This notebook showcases how to use an NMF model for a 1D monitoring element.
It consists of the following steps:

You can use this notebook both for global training (training on a large dataset) and for local training (training on a small number of well-chosen runs).

### imports

# external modules
import sys
import os
import numpy as np
import matplotlib.pyplot as plt

# local modules
sys.path.append('../utils')
import csv_utils as csvu
import dataframe_utils as dfu
import json_utils as jsonu
import hist_utils as hu
import autoencoder_utils as aeu
import plot_utils as pu
import generate_data_utils as gdu

sys.path.append('../src')
sys.path.append('../src/classifiers')
sys.path.append('../src/cloudfitters')
import DataLoader
import HistStruct
import ModelInterface
import NMFClassifier
import IdentityFitter
2022-07-26 17:24:20.766349: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cvmfs/sft.cern.ch/lcg/releases/MCGenerators/thepeg/2.2.1-8d929/x86_64-centos7-gcc8-opt/lib/ThePEG:/cvmfs/sft.cern.ch/lcg/releases/MCGenerators/herwig++/7.2.1-f3599/x86_64-centos7-gcc8-opt/lib/Herwig:/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/torch/lib:/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/tensorflow:/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/tensorflow/contrib/tensor_forest:/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/tensorflow/python/framework:/cvmfs/sft.cern.ch/lcg/releases/java/8u222-884d8/x86_64-centos7-gcc8-opt/jre/lib/amd64:/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib64:/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/8.3.0-cebb0/x86_64-centos7/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/8.3.0-cebb0/x86_64-centos7/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.30-e5b21/x86_64-centos7/lib:/usr/local/lib/:/cvmfs/sft.cern.ch/lcg/releases/R/3.6.3-dfb24/x86_64-centos7-gcc8-opt/lib64/R/library/readr/rcon
2022-07-26 17:24:20.766409: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
### define run properties
# in this cell all major run properties are going to be set,
# e.g. what runs to train on and what runs to test on

# define a list of good 'reference' runs 
# (e.g. found by eye)
goodrunsls = {'2017':
                {
                "297056":[[-1]],
                "297177":[[-1]],
                "301449":[[-1]],
                }
             }

# define core test set of clearly bad runs 
# (e.g. found by eye)
badrunsls = {'2017':
                {
                "297287":[[-1]],
                "297288":[[-1]],
                "297289":[[-1]],
                "299316":[[-1]],
                "299324":[[-1]],
                #"299326":[[-1]],
                }
            }

# set year to use
year = '2017'

# set histogram names to use 
histname = 'chargeInner_PXLayer_2'

# set whether to train globally or locally
training_mode = 'global'

if training_mode == 'global':
    runsls_training = None # use none to not add a mask for training (can e.g. use DCS-bit on mask)
    runsls_good = None # use none to not add a mask for good runs (can e.g. use templates)
    runsls_bad = badrunsls[year] # predefined bad runs
    print('selected runs/lumisections for training: all')

elif training_mode == 'local':
    # train locally on a small set of runs
    # for now on n runs preceding a chosen application run,
    # to be extended with choosing reference runs.

    # select application run
    available_runs = dfu.get_runs( dfu.select_dcson( csvu.read_csv('../data/DF'+year+'_'+histname+'.csv') ) )
    run_application = 305351
    run_application_index = available_runs.index(run_application)
    # select training set
    ntraining = 5
    runsls_training = jsonu.tuplelist_to_jsondict([(el,[-1]) for el in available_runs[run_application_index-ntraining:run_application_index]])
    runsls_bad = badrunsls[year]
    runsls_good = jsonu.tuplelist_to_jsondict([(run_application,[-1])])
    print('selected runs/lumisections for training: ')
    print(runsls_training)
    print('selected runs/lumisections as good test set:')
    print(runsls_good)
    print('selected runs/lumisections as bad test set:')
    print(runsls_bad)
selected runs/lumisections for training: all
### read the data based on the configuration defined above

readnew = True

if readnew:

    # initializations
    dloader = DataLoader.DataLoader()
    histstruct = HistStruct.HistStruct()
    print('adding {}...'.format(histname))
    # read the histograms from the correct csv files
    filename = '../data/DF'+year+'_'+histname+'.csv'
    df = dloader.get_dataframe_from_file( filename )
    # in case of local training, we can remove most of the histograms
    if( runsls_training is not None and runsls_good is not None and runsls_bad is not None ):
        runsls_total = {k: v for d in (runsls_training, runsls_good, runsls_bad) for k, v in d.items()}
        df = dfu.select_runsls( df, runsls_total )
    histstruct.add_dataframe( df )
    print('found {} histograms'.format(len(histstruct.runnbs)))

    # add masks
    histstruct.add_dcsonjson_mask( 'dcson' )
    histstruct.add_goldenjson_mask('golden' )
    histstruct.add_highstat_mask( 'highstat' )
    if runsls_training is not None: histstruct.add_json_mask( 'training', runsls_training )
    if runsls_good is not None: histstruct.add_json_mask( 'good', runsls_good )
    nbadruns = 0
    if runsls_bad is not None:
        histstruct.add_json_mask( 'bad', runsls_bad )
        # special case for bad runs: add a mask per run (different bad runs have different characteristics)
        nbadruns = len(runsls_bad.keys())
        for i,badrun in enumerate(runsls_bad.keys()):
            histstruct.add_json_mask( 'bad{}'.format(i), {badrun:runsls_bad[badrun]} )

if not readnew:

    histstruct = HistStruct.HistStruct.load( hsfilename )
    nbadruns = len([name for name in list(histstruct.masks.keys()) if 'bad' in name])

print('created a histstruct with the following properties:')
print('- number of histogram types: {}'.format(len(histstruct.histnames)))
print('- number of lumisections: {}'.format(len(histstruct.lsnbs)))
print('- masks: {}'.format(list(histstruct.masks.keys())))
adding chargeInner_PXLayer_2...
INFO in DataLoader.get_dataframe_from_file: loading dataframe from file ../data/DF2017_chargeInner_PXLayer_2.csv...
INFO in DataLoader.get_dataframe_from_file: sorting the dataframe...
INFO in DataLoader.get_dataframe_from_file: loaded a dataframe with 225954 rows and 16 columns.
found 225954 histograms
created a histstruct with the following properties:
- number of histogram types: 1
- number of lumisections: 225954
- masks: ['dcson', 'golden', 'highstat', 'bad', 'bad0', 'bad1', 'bad2', 'bad3', 'bad4']
skipthiscell = False

if( training_mode=='local' and not skipthiscell ):

    # training and application runs
    histstruct.plot_histograms( masknames=[['dcson','highstat','training'],['highstat','good']],
                                labellist = ['training','testing'],
                                colorlist = ['blue','green']
                              )

    # application run and bad test runs
    histstruct.plot_histograms( masknames=[['dcson','highstat','good'],['bad']],
                                labellist = ['good','bad'],
                                colorlist = ['green','red']
                              )
### extend the training set using artificial data

extendtraining = False

if extendtraining:
    histstruct.exthistograms['training'] = {}
    print('generating artificial training data for '+histname)
    hists = histstruct.get_histograms( histname=histname, masknames=['dcson','highstat','training'] )
    print('  original number of histograms: {}'.format(len(hists)))
    (exthists,_,_) = gdu.upsample_hist_set(hists, 5e4, doplot=True )
    histstruct.add_exthistograms( 'training', histname, exthists )
    print('  -> generated {} histograms'.format(len(histstruct.exthistograms['training'][histname])))
### define and train an NMF model

modelname = 'nmfclassifier'
model = ModelInterface.ModelInterface( histstruct.histnames )

if training_mode=='local':
    training_masks = ['dcson','highstat','training']
    hists_train = histstruct.get_histograms( histname=histname, masknames=training_masks )
elif training_mode=='global':
    training_masks = ['dcson','highstat']
    # use all available data for training (with DCS-on and statistics selection)
    #hists_train = histstruct.get_histograms( histname=histname, masknames=training_masks )
    # this can however take a long time... alternatively, use averaged histograms for training
    hists_train = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=training_masks ), 1000 )
if extendtraining: hists_train = histstruct.get_exthistograms( 'training', histname=histname )
classifier = NMFClassifier.NMFClassifier( ncomponents=3 )
classifier.train( hists_train )
classifier.set_nmax( 10 )
classifier.set_loss_type( 'chi2' )

model.set_classifiers( {histname: classifier} )
histstruct.add_model( modelname, model )
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/sklearn/decomposition/_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"

png

adding model "nmfclassifier" to the HistStruct
### plot the NMF components

components = classifier.get_components()
_ = pu.plot_hists_multi( components, colorlist=list(range(len(components))), xaxtitle='bin number', yaxtitle='arbitrary units', title='NMF components' )

png

### evaluate the models on all histograms in the (non-extended) histstruct

print('evaluating model for '+histname)
histstruct.evaluate_classifier( modelname, histname )
evaluating model for chargeInner_PXLayer_2
### train a density fitter on the scores obtained by the NMF model

histstruct.set_fitter( modelname, IdentityFitter.IdentityFitter() )
histstruct.train_fitter( modelname, masknames=training_masks )
histstruct.evaluate_fitter( modelname )
### extend the test set using artificial data generation and evaluate the model on the extended test set

# make the extra data
print('generating data for '+histname)
if 'good' in histstruct.masks.keys():
    goodhists = histstruct.get_histograms( histname=histname,masknames=['dcson','highstat','good'] )
else:
    goodhists = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 15 )
(goodexthists,_,_) = gdu.upsample_hist_set( goodhists,ntarget=nbadruns*5e3,fourierstdfactor=20., doplot=False)
histstruct.add_exthistograms( 'good', histname, goodexthists, overwrite=True )
# alternative: copy original good set (e.g. for using resampled bad but original good)
#histstruct.add_exthistograms( 'good', histname, goodhists )
for i in range(nbadruns):
    badhists = histstruct.get_histograms( histname=histname,masknames=['dcson','highstat','bad{}'.format(i)] )
    (badexthists,_,_) = gdu.upsample_hist_set( badhists,ntarget=5e3,fourierstdfactor=20., doplot=False)
    histstruct.add_exthistograms( 'bad{}'.format(i), histname, badexthists, overwrite=True )

# evaluate the classifiers
print('evaluating: '+histname)
histstruct.evaluate_classifier( modelname, histname, setnames=['good'] )
for i in range(nbadruns):
    histstruct.evaluate_classifier( modelname, histname, setnames=['bad{}'.format(i)])

# evaluate the fitter
print('evaluating fitter')
histstruct.evaluate_fitter( modelname, setnames=['good'] )
for i in range(nbadruns):
    histstruct.evaluate_fitter( modelname, setnames=['bad{}'.format(i)] )
generating data for chargeInner_PXLayer_2
evaluating: chargeInner_PXLayer_2
evaluating fitter
### get the scores

# get the log probability for good set
prob_good = histstruct.get_globalscores( modelname, setnames=['good'] )
logprob_good = np.log(prob_good)
# get the log probability for bad set
prob_bad = histstruct.get_globalscores( modelname, setnames=['bad{}'.format(i) for i in range(nbadruns)] )
logprob_bad = np.log(prob_bad)
### make a roc curve based on the test results above

labels_good = np.zeros(len(logprob_good)) # background: label = 0
labels_bad = np.ones(len(logprob_bad)) # signal: label = 1

labels = np.concatenate(tuple([labels_good,labels_bad]))
scores = np.concatenate(tuple([-logprob_good,-logprob_bad]))

pu.plot_score_dist(scores, labels, nbins=1000, normalize=True)

auc = aeu.get_roc(scores, labels, mode='geom', bootstrap_samples=100, doprint=False)

aeu.get_confusion_matrix(scores, labels, wp='maxauc')

png

calculating ROC curve on 100 bootstrap samples of size 49958

png

png

(-4.156618406017773, <Figure size 432x288 with 2 Axes>, <AxesSubplot:>)

png

### plot some random examples

# define reference histograms
refhists = {}
if( 'good' in histstruct.masks.keys() ): 
    refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['highstat','dcson','good']), 15 )
else: 
    refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 15 )

# define number of plots to make
nplot = 10

# make plots for good histograms
print('example histograms from the good test set:')
runnbs_good = histstruct.get_runnbs( masknames=['highstat','dcson'] )
lsnbs_good = histstruct.get_lsnbs( masknames=['highstat','dcson'] )
indices = np.random.choice( np.arange(len(runnbs_good)), size=nplot, replace=False )
for i in indices:
    _ = histstruct.plot_ls( runnbs_good[i], lsnbs_good[i], recohist='nmfclassifier', refhists=refhists )
    plt.show()

# make plots for bad histograms
print('example histograms from the bad test set:')
runnbs_bad = histstruct.get_runnbs( masknames=['dcson','bad'] )
lsnbs_bad = histstruct.get_lsnbs( masknames=['dcson','bad'] )
indices = np.random.choice( np.arange(len(runnbs_bad)), size=nplot, replace=False )
for i in indices:
    _ = histstruct.plot_ls( runnbs_bad[i], lsnbs_bad[i], recohist='nmfclassifier', refhists=refhists )
    plt.show()
example histograms from the good test set:

png

png

png

png

png

png

png

png

png

png

example histograms from the bad test set:

png

png

png

png

png

png

png

png

png

png

### investigate particular lumisections

# initialization: general
mode = 'run'
run = 299316
# for mode 'ls' (ignored if mode is 'run'):
ls = 70
# for mode 'run' (ignored if mode is 'ls'):
run_masknames = ['dcson','highstat']

# initialization: reference scores
plot_refscores = True
refscore_masknames = ['dcson','highstat']

# initialization: reference histograms
refhists = {}
for histname in histstruct.histnames: 
    refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['highstat','dcson']), 50 )

if mode=='ls':
    # plot this particular run/ls
    fig,axs = histstruct.plot_ls( run, ls, recohist=None, refhists=refhists, 
                                opaque_legend=True,
                                ncols = 3,
                                physicalxax = True,
                                ymaxfactor = 1.3,
                                legendsize = 13
                              )

    # print the mses
    msepoint = histstruct.get_scores_ls( modelname, run, ls )
    logprob = np.log( histstruct.evaluate_fitter_on_point( modelname, msepoint ) )
    print('-------------')
    print('MSE values:')
    for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
    print('-------------')
    print('logprob: '+str(logprob))

    # plot mse distribution
    if plot_refscores:
        fig,axs = histstruct.plot_ls_score( modelname, run, ls, masknames=refscore_masknames, 
                        nbins=100, normalize=True,
                        siglabel='This lumisection', bcklabel='All lumisections',
                        sigcolor='k', bckcolor='b',
                        title=None, 
                        xaxtitle='MSE', xaxtitlesize=15,
                        yaxtitle='Normalized number of lumisections', yaxtitlesize=15,
                        doshow=False)


if mode=='run':
    # plot given run
    runnbs = histstruct.get_runnbs( masknames=run_masknames )
    lsnbs = histstruct.get_lsnbs( masknames=run_masknames )
    runsel = np.where(runnbs==run)
    lsnbs = lsnbs[runsel]
    print('plotting {} lumisections...'.format(len(lsnbs)))
    for lsnb in lsnbs:
        fig,ax = histstruct.plot_ls(run, lsnb, recohist=None, refhists=refhists, opaque_legend=True )
        plt.show()
        msepoint = histstruct.get_scores_ls( modelname, run, lsnb )
        msepointarray = np.array([msepoint[histname] for histname in histstruct.histnames])
        logprob = np.log( histstruct.evaluate_fitter_on_point( modelname, msepoint ) )
        print('-------------')
        print('MSE values:')
        for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
        print('-------------')
        print('logprob: '+str(logprob))
plotting 43 lumisections...

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11742876]
-------------
logprob: [2.1419234]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11759748]
-------------
logprob: [2.14048768]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11930701]
-------------
logprob: [2.12605523]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11448997]
-------------
logprob: [2.16726804]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11787179]
-------------
logprob: [2.1381578]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11709544]
-------------
logprob: [2.14476592]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11491754]
-------------
logprob: [2.16354048]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11651301]
-------------
logprob: [2.14975236]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11659569]
-------------
logprob: [2.14904299]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11531861]
-------------
logprob: [2.16005649]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11699337]
-------------
logprob: [2.14563803]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11511875]
-------------
logprob: [2.16179106]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11746122]
-------------
logprob: [2.141647]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11491104]
-------------
logprob: [2.16359701]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11559568]
-------------
logprob: [2.15765668]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11706182]
-------------
logprob: [2.14505314]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11533168]
-------------
logprob: [2.15994317]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11316278]
-------------
logprob: [2.178928]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11744022]
-------------
logprob: [2.14182586]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11295069]
-------------
logprob: [2.18080389]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11432042]
-------------
logprob: [2.16875009]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11577394]
-------------
logprob: [2.15611574]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11596593]
-------------
logprob: [2.15445883]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11252853]
-------------
logprob: [2.1845485]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11418823]
-------------
logprob: [2.16990708]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11592322]
-------------
logprob: [2.15482724]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11453416]
-------------
logprob: [2.16688216]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11061984]
-------------
logprob: [2.20165583]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11639396]
-------------
logprob: [2.15077467]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11509686]
-------------
logprob: [2.16198121]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11489376]
-------------
logprob: [2.16374741]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11569846]
-------------
logprob: [2.15676797]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11598947]
-------------
logprob: [2.15425584]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11440008]
-------------
logprob: [2.16805346]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11418719]
-------------
logprob: [2.16991614]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11207464]
-------------
logprob: [2.18859017]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11372892]
-------------
logprob: [2.17393758]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11627835]
-------------
logprob: [2.1517684]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11539924]
-------------
logprob: [2.15935751]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11177484]
-------------
logprob: [2.1912688]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11515268]
-------------
logprob: [2.16149634]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11584298]
-------------
logprob: [2.15551965]

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.11319422]
-------------
logprob: [2.17865016]