Train a combined model with global training and evaluate its predictions

This notebook uses multiple types of histograms, trains a classifier per histogram type, combines their output scores into a single global score, and assesses the performance of the resulting combined model.
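
Concretely, each histogram type $h$ yields a per-lumisection score $s_h$ (here the mean squared error between a histogram and its reconstruction), and the combination step fits a probability density $p(s_1,\dots,s_n)$ to the score vectors of good lumisections. A lumisection is then ranked by its global score

$$ \text{global score} = \log p(s_1, s_2, \dots, s_n), $$

where low values flag anomalous lumisections; this is, in outline, what the per-histogram classifiers and the density fitter defined below implement.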

### imports

# external modules
import os
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# local modules
sys.path.append('../utils')
import csv_utils as csvu
import json_utils as jsonu
import dataframe_utils as dfu
import hist_utils as hu
import autoencoder_utils as aeu
import plot_utils as pu
import generate_data_utils as gdu
import refruns_utils as rru
# import general source
sys.path.append('../src')
sys.path.append('../src/classifiers')
sys.path.append('../src/cloudfitters')
import Model
import ModelInterface
import HistStruct
import DataLoader
import PlotStyleParser
# import classifiers
import AutoEncoder
import TemplateBasedClassifier
import NMFClassifier
import LandauClassifier
import MomentClassifier
# import combination methods
import SeminormalFitter
### define titles and axis properties for figures

plotstyleparser = PlotStyleParser.PlotStyleParser('plotstyle.json')
titledict = plotstyleparser.get_title()
xaxtitledict = plotstyleparser.get_xaxtitle()
yaxtitledict = plotstyleparser.get_yaxtitle()
for key in yaxtitledict.keys(): yaxtitledict[key] += ' (normalized)' # if normalized
extratextdict = plotstyleparser.get_extratext()
### define run properties
# in this cell all major run properties are going to be set,
# e.g. what runs to train on and what runs to test on

# define core test set of bad runs
# (format: run number -> list of lumisection ranges; [[-1]] selects all lumisections of the run)
badrunsls = {'2017':
                {
                "297287":[[-1]],
                "297288":[[-1]],
                "297289":[[-1]],
                "299316":[[-1]],
                "299317":[[-1]],
                "299318":[[-1]],
                "299324":[[-1]],
                "299325":[[-1]],
                "299326":[[-1]],
                "300373":[[-1]],
                "300374":[[-1]],
                "300397":[[-1]],
                "300398":[[-1]]
                }
            }

# set year to use
year = '2017'

# set histogram names to use 
histnames = ([
             #'chargeInner_PXLayer_1',
             'chargeInner_PXLayer_2',
             'chargeInner_PXLayer_3',
             #'chargeInner_PXLayer_4',
             #'charge_PXDisk_+1','charge_PXDisk_+2','charge_PXDisk_+3',
             #'charge_PXDisk_-1','charge_PXDisk_-2','charge_PXDisk_-3',
            ])

# redefine badrunsls for this year only
badrunsls = badrunsls[year]
print('selected runs/lumisections for training: all')
selected runs/lumisections for training: all
### read the data based on the configuration defined above

readnew = True

if readnew:

    ### add the histograms
    # initializations
    dloader = DataLoader.DataLoader()
    histstruct = HistStruct.HistStruct()
    # loop over the histogram types to take into account
    for histname in histnames:
        print('adding {}...'.format(histname))
        # read the histograms from the correct csv files
        filename = '../data/DF'+year+'_'+histname+'.csv'
        df = dloader.get_dataframe_from_file( filename )
        # define slice to remove under- and overflow bins
        cropslices = [slice(1,-1)]
        # add the dataframe to the histstruct
        histstruct.add_dataframe( df, cropslices=cropslices )
    print('found {} lumisections'.format(len(histstruct.runnbs)))

    ### add masks
    histstruct.add_dcsonjson_mask( 'dcson' )
    histstruct.add_goldenjson_mask('golden' )
    histstruct.add_highstat_mask( 'highstat' )
    histstruct.add_stat_mask( 'lowstat', max_entries_to_bins_ratio=100 )
    nbadruns = 0
    histstruct.add_json_mask( 'bad', badrunsls )
    # special case for bad runs: add a mask per run (different bad runs have different characteristics)
    nbadruns = len(badrunsls.keys())
    for badrun in badrunsls.keys():
        histstruct.add_json_mask( 'bad{}'.format(badrun), {badrun:badrunsls[badrun]} )

    print('created a histstruct with the following properties:')
    print('- number of histogram types: {}'.format(len(histstruct.histnames)))
    print('- number of lumisections: {}'.format(len(histstruct.lsnbs)))
    print('- masks: {}'.format(list(histstruct.masks.keys())))

if not readnew:

    histstruct = HistStruct.HistStruct.load( 'histstruct_global_20220201.zip', verbose=False )
    nbadruns = len([name for name in list(histstruct.masks.keys()) if ('bad' in name and name!='bad')])

    print('loaded a histstruct with the following properties:')
    print(histstruct)
adding chargeInner_PXLayer_2...
INFO in DataLoader.get_dataframe_from_file: loading dataframe from file ../data/DF2017_chargeInner_PXLayer_2.csv...
INFO in DataLoader.get_dataframe_from_file: sorting the dataframe...
INFO in DataLoader.get_dataframe_from_file: loaded a dataframe with 225954 rows and 16 columns.
adding chargeInner_PXLayer_3...
INFO in DataLoader.get_dataframe_from_file: loading dataframe from file ../data/DF2017_chargeInner_PXLayer_3.csv...
INFO in DataLoader.get_dataframe_from_file: sorting the dataframe...
INFO in DataLoader.get_dataframe_from_file: loaded a dataframe with 225954 rows and 16 columns.
found 225954 lumisections
created a histstruct with the following properties:
- number of histogram types: 2
- number of lumisections: 225954
- masks: ['dcson', 'golden', 'highstat', 'lowstat', 'bad', 'bad297287', 'bad297288', 'bad297289', 'bad299316', 'bad299317', 'bad299318', 'bad299324', 'bad299325', 'bad299326', 'bad300373', 'bad300374', 'bad300397', 'bad300398']
### plot the training and/or test sets

doplot = True

if doplot:

    index_mask = np.random.choice( np.arange(len(histstruct.get_lsnbs())), size=20, replace=False )
    histstruct.add_index_mask( 'random_plotting', index_mask )
    # bad test runs
    for badrun in badrunsls.keys():
        fig,axs = histstruct.plot_histograms( masknames=[['dcson','highstat','random_plotting'],['dcson','highstat','bad{}'.format(badrun)]],
                                labellist = ['Example histograms','Run {}'.format(badrun)],
                                colorlist = ['blue','red'],
                                transparencylist = [0.1,1.],
                                ncols=3,
                                opaque_legend=True,
                                titledict = titledict, titlesize=15,
                                physicalxax = True,
                                xaxtitledict = xaxtitledict, xaxtitlesize=17,
                                yaxtitledict = yaxtitledict, yaxtitlesize=17,
                                ymaxfactor = 1.2,
                                legendsize = 14
                                )
        # stylistic modifications
        counter = -1
        for i in range(axs.shape[0]): 
            for j in range(axs.shape[1]):
                counter += 1
                if counter>=len(histstruct.histnames): break
                ax = axs[i,j]
                pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
                pu.add_text( ax, extratextdict[histstruct.histnames[counter]], (0.95,0.6), fontsize=15, horizontalalignment='right' )
                pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
        fig.subplots_adjust(wspace=0.3, hspace=0.3)
    histstruct.remove_mask('random_plotting')

png (13 figures, one per selected bad run: example good histograms in blue overlaid with the histograms of that run in red)

### define a good test set as averages from training set

npartitions = 50

for histname in histstruct.histnames:
    histograms = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['dcson','highstat']), npartitions )
    histstruct.add_exthistograms( 'partitions', histname, histograms, overwrite=True )
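
For intuition, a minimal sketch of what such a partition-and-average step could look like (an illustration with a hypothetical helper, not the actual hu.averagehists implementation):

import numpy as np

def average_partitions(hists, npartitions):
    # hists: 2D array of shape (number of histograms, number of bins);
    # split into npartitions groups and return the bin-wise mean of each group
    parts = np.array_split(hists, npartitions, axis=0)
    return np.array([part.mean(axis=0) for part in parts])

# example usage on one histogram type:
# goodavg = average_partitions(histstruct.get_histograms(histname=histname, masknames=['dcson','highstat']), 50)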
### make a plot of the good test set

doplot = True

if doplot:

    fig,axs = histstruct.plot_histograms( histograms=[histstruct.get_histograms(setnames=['partitions'])],
                                labellist = ['Good test set'],
                                colorlist = ['blue'],
                                transparencylist = [0.1],
                                ncols=3,
                                opaque_legend=True,
                                titledict = titledict, titlesize=15,
                                physicalxax = True,
                                xaxtitledict = xaxtitledict, xaxtitlesize=17,
                                yaxtitledict = yaxtitledict, yaxtitlesize=17,
                                #ymaxfactor = 1.2,
                                legendsize = 14
                                )
    # stylistic modifications
    counter = -1
    for i in range(axs.shape[0]): 
        for j in range(axs.shape[1]):
            counter += 1
            if counter>=len(histstruct.histnames): break
            ax = axs[i,j]
            pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
            pu.add_text( ax, extratextdict[histstruct.histnames[counter]], (0.95,0.6), fontsize=15, horizontalalignment='right' )
            pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

png

### print some numbers
# count lumisections: all, DCS-on only, DCS-on + high statistics (the training selection)
print(len(histstruct.get_lsnbs()))
print(len(histstruct.get_lsnbs(masknames=['dcson'])))
print(len(histstruct.get_lsnbs(masknames=['dcson','highstat'])))
trainingmasks = ['dcson','highstat']
print(len(histstruct.get_lsnbs(masknames=trainingmasks)))
225954
215144
211171
211171
### define and train a classifier for each histogram type

classifymethod = 'nmf'
modelname = classifymethod
if modelname in histstruct.modelnames:
    raise Exception('WARNING: modelname "{}" is already present in histstruct.'.format(modelname)
                   +' Choose a different name or remove the duplicate.')
model = ModelInterface.ModelInterface(histstruct.histnames)
classifiers = {}

if classifymethod == 'autoencoder':

    # settings for optionally saving the trained keras models
    # (placeholder values; these are not defined elsewhere in this notebook)
    save = False
    modelloc = 'models'
    modelbasename = 'autoencoder'

    for histname in histstruct.histnames:
        print('building Autoencoder model for histogram type {}'.format(histname))
        hists = histstruct.get_histograms(histname=histname, masknames=['dcson','highstat'])
        print('size of training set: {}'.format(hists.shape))
        # choose whether to save the model
        # (use a separate variable so the modelname defined above is not overwritten)
        aemodelpath = os.path.join(modelloc, modelbasename+'_'+histname+'.h5')
        if not save: aemodelpath = '' # empty string means do not save the model
        # train the model
        (aemodel, history) = aeu.train_simple_autoencoder(hists,
                                             nepochs=20,
                                             modelname=aemodelpath,
                                             batch_size=2000,
                                             shuffle=True,
                                             returnhistory=True
                                            )
        # make a loss plot
        fig,ax = pu.plot_loss(history, xaxtitlesize=15, yaxtitlesize=15, legendsize=15, doshow=False)
        pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
        pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
        pu.add_text( ax, pu.make_text_latex_safe(histname), (0.95,0.6), fontsize=15, horizontalalignment='right' )
        plt.show()
        # add the trained classifier to the dictionary
        classifiers[histname] = AutoEncoder.AutoEncoder( model=aemodel )

if classifymethod == 'templates':

    for histname in histstruct.histnames:
        hists = histstruct.get_histograms(histname=histname, masknames=['dcson','highstat'])
        hists = hu.averagehists( hists, 25 )
        classifier = TemplateBasedClassifier.TemplateBasedClassifier()
        classifier.train( hists )
        classifiers[histname] = classifier

if classifymethod == 'nmf':

    for histname in histstruct.histnames:
        print('building NMF model for histogram type {}'.format(histname))
        hists = histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'])
        # option: reduce size of training set by choosing randoms
        random_indices = np.random.choice(len(hists), size=int(5e4), replace=False)
        hists = hists[random_indices]
        print('size of training set: {}'.format(hists.shape))
        classifier = NMFClassifier.NMFClassifier( ncomponents=3, nmax=10 )
        classifier.train( hists )
        classifiers[histname] = classifier

if classifymethod == 'landaufit':

    print('initializing Landau fit classifier')
    classifier = LandauClassifier.LandauClassifier( dogauss=True )
    for histname in histstruct.histnames:
        classifiers[histname] = classifier

if classifymethod == 'moments':

    for histname in histstruct.histnames:
        print('calculating moments for histogram type {}'.format(histname))
        hists = histstruct.get_histograms(histname=histname, masknames=['dcson','highstat'])
        classifier = MomentClassifier.MomentClassifier( orders=[1,2] )
        classifier.train(hists)
        classifiers[histname] = classifier

model.set_classifiers(classifiers)
histstruct.add_model( modelname, model )
building NMF model for histogram type chargeInner_PXLayer_2
size of training set: (50000, 100)


/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/sklearn/decomposition/_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"

png

building NMF model for histogram type chargeInner_PXLayer_3
size of training set: (50000, 100)


/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/sklearn/decomposition/_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"

png

adding model "nmf" to the HistStruct
### evaluate the models on all histograms in the (non-extended) histstruct and on the partitions

# evaluate classifiers
print('evaluating classifiers')
histstruct.evaluate_classifiers( modelname )
histstruct.evaluate_classifiers( modelname, setnames=['partitions'] )
evaluating classifiers
### plot the multidimensional MSE and fit a distribution

# set method and other properties
doplot = True

# initializations 
dimslist = []
nhisttypes = len(histstruct.histnames)
for i in range(0,nhisttypes-1):
    for j in range(i+1,nhisttypes):
        dimslist.append((i,j))
plt.close('all')

# define training masks
training_masks = ['dcson','highstat']

# train the actual fitter
histstruct.set_fitter( modelname, SeminormalFitter.SeminormalFitter() )
histstruct.train_fitter( modelname, masknames=training_masks )
histstruct.evaluate_fitter( modelname )

# train and plot the partial fitters
if doplot:

    histstruct.train_partial_fitters( modelname, dimslist, masknames=training_masks )
    for dims in dimslist:
            fig,ax = histstruct.plot_partial_fit( modelname, dims, 
                    clusters=[{'masknames':training_masks}],
                    colors=['b'],
                    labels=['Training lumisections'],
                    logprob=True, clipprob=True,
                    xlims=30, ylims=30, 
                    onlypositive=True, transparency=0.5,
                    xaxtitle=pu.make_text_latex_safe(histstruct.histnames[dims[0]])+' MSE', xaxtitlesize=12,
                    yaxtitle=pu.make_text_latex_safe(histstruct.histnames[dims[1]])+' MSE', yaxtitlesize=12,
                    caxtitle='Logarithm of probability density', caxtitlesize=12)
            pu.add_cms_label( ax, pos=(0.05,0.9), extratext='Preliminary', fontsize=12, background_alpha=0.75 )
            pu.add_text( ax, 'Density fit of lumisection MSE', (0.05,0.8), fontsize=12, background_alpha=0.75 )
            pu.add_text( ax, '2017 (13 TeV)', (0.73,1.01), fontsize=12 )
            #plt.close('all') # release plot memory
/eos/home-l/llambrec/SWAN_projects/ML4DQM-DC/tutorials/../utils/plot_utils.py:904: RuntimeWarning: divide by zero encountered in log
  if logprob: evalpoints = np.log(evalpoints)


NOTE: scores of -inf were reset to -745.4400719213812

png
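
For reference, a minimal sketch of the density-fit idea, using a plain multivariate normal as a stand-in for the SeminormalFitter (the actual fitter may parametrize the distribution differently):

import numpy as np
from scipy.stats import multivariate_normal

def fit_and_logprob(scores_train, scores_test):
    # scores_*: 2D arrays of shape (number of lumisections, number of histogram types),
    # holding the per-type MSE values
    mean = np.mean(scores_train, axis=0)
    cov = np.cov(scores_train, rowvar=False)
    density = multivariate_normal(mean=mean, cov=cov, allow_singular=True)
    # logarithm of the fitted probability density, used as the global score
    return density.logpdf(scores_test)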

### extend the test set using artificial data generation and evaluate the model on the extended test set

# make the extra data
trainingmasknames = ['dcson','highstat']
for histname in histstruct.histnames:
    print('generating data for '+histname)
    # good data option 1: resample partitions
    #goodhists = histstruct.get_histograms( setnames=['partitions'], histname=histname )
    #(goodexthists,_,_) = gdu.upsample_hist_set( goodhists, ntarget=nbadruns*5e3, fourierstdfactor=20., doplot=False)
    # good data option 2: use training set
    goodexthists = histstruct.get_histograms( histname=histname, masknames=trainingmasknames )
    histstruct.add_exthistograms( 'test_good_ext', histname, goodexthists, overwrite=True )
    for badrun in badrunsls.keys():
        badhists = histstruct.get_histograms( histname=histname,masknames=['dcson','highstat','bad{}'.format(badrun)] )
        (badexthists,_,_) = gdu.upsample_hist_set( badhists, ntarget=5e3, fourierstdfactor=20., doplot=False)
        histstruct.add_exthistograms( 'test_bad{}_ext'.format(badrun), histname, badexthists, overwrite=True )

# evaluate the classifiers
print('evaluating classifiers')
histstruct.evaluate_classifiers( modelname, setnames=['test_good_ext'] )
# shortcut if using training set as good test set: copy scores
#histstruct.models[modelname].scores['test_good_ext'] = histstruct.get_scores( modelname, masknames=trainingmasknames )
for badrun in badrunsls.keys():
    histstruct.evaluate_classifiers( modelname, setnames=['test_bad{}_ext'.format(badrun)])

# evaluate the fitter
print('evaluating fitter')
histstruct.evaluate_fitter( modelname, setnames=['test_good_ext'] )
for badrun in badrunsls.keys():
    histstruct.evaluate_fitter( modelname, setnames=['test_bad{}_ext'.format(badrun)] )
generating data for chargeInner_PXLayer_2
generating data for chargeInner_PXLayer_3
evaluating classifiers
evaluating fitter
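
The resampling itself is handled by gdu.upsample_hist_set; as a rough, hypothetical illustration of the idea (resample histograms with replacement and distort them with smooth noise), one could do something like the sketch below. The fourierstdfactor parameter of the real function is not modelled here:

import numpy as np

def upsample_with_smooth_noise(hists, ntarget, relstd=0.05, rng=None):
    # hists: 2D array of shape (number of histograms, number of bins);
    # returns an array of shape (ntarget, number of bins)
    rng = np.random.default_rng() if rng is None else rng
    ntarget = int(ntarget)
    nhists, nbins = hists.shape
    # resample the input histograms with replacement
    base = hists[rng.integers(0, nhists, size=ntarget)]
    # build smooth multiplicative noise from a few low-frequency sine modes
    x = np.linspace(0., 2.*np.pi, nbins)
    noise = np.zeros((ntarget, nbins))
    for k in range(1, 4):
        amp = rng.normal(scale=relstd, size=(ntarget, 1))
        phase = rng.uniform(0., 2.*np.pi, size=(ntarget, 1))
        noise += amp*np.sin(k*x[np.newaxis, :] + phase)
    return base*(1. + noise)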
### (re-)define the test set

badrunstokeep = [
                "297287",
                "297288",
                "297289",
                "299316",
                "299317",
                "299318",
                "299324",
                "299325",
                "299326",
                "300373",
                "300374",
                "300397",
                "300398"
                ]

# make a new mask that is the union of the selected bad runs
histstruct.remove_mask( 'badselected' )
histstruct.add_mask( 'badselected', histstruct.get_union_mask(['bad{}'.format(badrun) for badrun in badrunstokeep]) )

# training set
trainset_args = {'masknames': training_masks}
goodset_args = {'setnames': ['test_good_ext']}
badset_args = {'setnames': ['test_bad{}_ext'.format(badrun) for badrun in badrunstokeep]}
badset_parts_args = []
for badrun in badrunstokeep: 
    badset_parts_args.append( {'setnames': ['test_bad{}_ext'.format(badrun)]} )

# get the log probability for good set
prob_good = histstruct.get_globalscores( modelname, **goodset_args )
logprob_good = np.log(prob_good)
# get the log probability for bad set
prob_bad = histstruct.get_globalscores( modelname, **badset_args )
logprob_bad = np.log(prob_bad)

print('--- good lumisections ---')
print('length of log prob array: '+str(len(logprob_good)))
print('minimum of log prob: '+str(np.min(logprob_good)))
print('--- bad lumisections ---')
print('length of log prob array: '+str(len(logprob_bad)))
print('maximum of log prob: '+str(np.max(logprob_bad)))
WARNING in HistStruct.remove_mask: name badselected is not in list of masks...
--- good lumisections ---
length of log prob array: 211171
minimum of log prob: -inf
--- bad lumisections ---
length of log prob array: 64814
maximum of log prob: 18.192293308278305


/tmp/ipykernel_27081/3315064921.py:33: RuntimeWarning: divide by zero encountered in log
  logprob_good = np.log(prob_good)
/tmp/ipykernel_27081/3315064921.py:36: RuntimeWarning: divide by zero encountered in log
  logprob_bad = np.log(prob_bad)
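
The divide-by-zero warnings above come from lumisections whose fitted probability is exactly zero, so their log probability is -inf. A small sketch of one way to guard the logarithm (the notebook itself relies on aeu.clip_scores for the ROC curve further down):

import numpy as np

def safe_log(prob):
    # take the logarithm without the divide-by-zero warning,
    # then replace -inf entries by the smallest finite value found
    with np.errstate(divide='ignore'):
        logp = np.log(np.asarray(prob, dtype=float))
    finite = np.isfinite(logp)
    logp[~finite] = np.min(logp[finite])
    return logp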
### make 1D score (MSE) distributions for all histogram types

for i,histname in enumerate(histstruct.histnames):

    fig,axs = plt.subplots(ncols=3, figsize=(15,4))
    scores_train = histstruct.get_scores( modelname, histname=histname, **trainset_args )
    scores_good = histstruct.get_scores( modelname, histname=histname, **goodset_args )
    scores_bad = histstruct.get_scores( modelname, histname=histname, **badset_args )

    # plot training and good set
    scores = np.concatenate((scores_train,scores_good))
    labels = np.concatenate((np.ones(len(scores_train)),np.zeros(len(scores_good))))
    pu.plot_score_dist( scores, labels,
                        fig=fig, ax=axs[0],
                        nbins=100, normalize=True,
                        siglabel='Training set', sigcolor='b',
                        bcklabel='Good test set', bckcolor='g',
                        title=None, 
                        xaxtitle='Mean squared error', xaxtitlesize=14,
                        yaxtitle='Number of lumisections (normalized)', yaxtitlesize=14,
                        legendsize=15,
                        doshow=False)
    pu.add_cms_label( axs[0], pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
    pu.add_text( axs[0], pu.make_text_latex_safe(histname), (0.95,0.65), fontsize=15, horizontalalignment='right' )
    pu.add_text( axs[0], '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )

    # plot training and bad set
    scores = np.concatenate((scores_train,scores_bad))
    labels = np.concatenate((np.ones(len(scores_train)),np.zeros(len(scores_bad))))
    pu.plot_score_dist( scores, labels, 
                        fig=fig, ax=axs[1],
                        nbins=100, normalize=True,
                        siglabel='Training set', sigcolor='b',
                        bcklabel='Bad test set', bckcolor='r',
                        title=None, 
                        xaxtitle='Mean squared error', xaxtitlesize=14,
                        yaxtitle='Number of lumisections (normalized)', yaxtitlesize=14,
                        legendsize=15,
                        doshow=False)
    pu.add_cms_label( axs[1], pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
    pu.add_text( axs[1], pu.make_text_latex_safe(histname), (0.95,0.65), fontsize=15, horizontalalignment='right' )
    pu.add_text( axs[1], '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )

    # plot good and bad set
    scores = np.concatenate((scores_good,scores_bad))
    labels = np.concatenate((np.ones(len(scores_good)),np.zeros(len(scores_bad))))
    pu.plot_score_dist(scores, labels, fig=fig, ax=axs[2],
                        nbins=100, normalize=True,
                        siglabel='Good test set', sigcolor='g',
                        bcklabel='Bad test set', bckcolor='r',
                        title=None, 
                        xaxtitle='Mean squared error', xaxtitlesize=14,
                        yaxtitle='Number of lumisections (normalized)', yaxtitlesize=14,
                        legendsize=15,
                        doshow=False)
    pu.add_cms_label( axs[2], pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
    pu.add_text( axs[2], pu.make_text_latex_safe(histname), (0.95,0.65), fontsize=15, horizontalalignment='right' )
    pu.add_text( axs[2], '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )

    fig.subplots_adjust(wspace=0.3, hspace=0.3)

png

png

### make a new plot of probability contours and overlay data points 

doplot = True

if doplot:

    # initializations
    plt.close('all')
    colormap = mpl.cm.get_cmap('Reds')
    colorlist = [colormap(i) for i in np.linspace(0.3,0.9,num=len(badrunstokeep))]
    if len(colorlist)<len(badrunstokeep):
        raise Exception('ERROR: need more colors...')

    clusters = []
    labels = []
    colors = []
    # add good set
    clusters.append(goodset_args)
    colors.append('green')
    labels.append('Averages of training set')
    # add bad sets
    for j,run in enumerate(badrunstokeep): 
        clusters.append(badset_parts_args[j])
        labels.append('Run {}'.format(run))
        colors.append(colorlist[j])

    # make the plots
    for dims in dimslist:
        fig,ax = histstruct.plot_partial_fit( modelname, dims, 
                    clusters=clusters,
                    colors=colors,
                    labels=labels,
                    logprob=True, clipprob=True,
                    xlims=30, ylims=30, 
                    onlypositive=True, transparency=0.5,
                    xaxtitle=pu.make_text_latex_safe(histstruct.histnames[dims[0]])+' MSE', xaxtitlesize=12,
                    yaxtitle=pu.make_text_latex_safe(histstruct.histnames[dims[1]])+' MSE', yaxtitlesize=12,
                    caxtitle='Logarithm of probability density', caxtitlesize=12)
        pu.add_cms_label( ax, pos=(0.05,0.9), extratext='Preliminary', fontsize=12, background_alpha=0.75 )
        pu.add_text( ax, 'Density fit of lumisection MSE', (0.05,0.8), fontsize=12, background_alpha=0.75 )
        pu.add_text( ax, '2017 (13 TeV)', (0.73,1.01), fontsize=12 )
        #plt.close('all') # release plot memory
/eos/home-l/llambrec/SWAN_projects/ML4DQM-DC/tutorials/../utils/plot_utils.py:904: RuntimeWarning: divide by zero encountered in log
  if logprob: evalpoints = np.log(evalpoints)


NOTE: scores of -inf were reset to -745.4400719213812

png

### make a ROC curve based on the test results above

labels_good = np.zeros(len(logprob_good)) # background: label = 0
labels_bad = np.ones(len(logprob_bad)) # signal: label = 1

labels = np.concatenate((labels_good,labels_bad))
scores = np.concatenate((-logprob_good,-logprob_bad))
scores = aeu.clip_scores( scores )

# plot score distribution
fig,ax = pu.plot_score_dist(scores, labels, 
                   siglabel='Anomalous', sigcolor='r', 
                   bcklabel='Good', bckcolor='g', 
                   nbins=250, normalize=True,
                   xaxtitle='Negative logarithmic probability', xaxtitlesize=15,
                   yaxtitle='Number of lumisections (normalized)', yaxtitlesize=15,
                   legendsize=15,
                   doshow=False)
ax.set_yscale('log')
pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
pu.add_text( ax, 'Combined model output score', (0.1,0.65), fontsize=15 )
pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
plt.show()

# plot ROC curve
(auc,sigeff,bkgeff,sigeff_unc) = aeu.get_roc(scores, labels, 
                                             mode='geom', doprint=False,
                                             bootstrap_samples=100,
                                             doplot=False, returneffs=True)
fig,ax = pu.plot_roc( sigeff, bkgeff, sig_eff_unc=sigeff_unc,
                      xaxtitle='False anomaly rate', xaxtitlesize=15,
                      yaxtitle='True anomaly efficiency', yaxtitlesize=15, doshow=False
                       )
auctext = '{:.3f}'.format(auc)
if auc>0.99: auctext = '1 - '+'{:.3e}'.format(1-auc)
pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
pu.add_text( ax, 'Combined model ROC curve', (0.95,0.25), fontsize=15, horizontalalignment='right' )
pu.add_text( ax, 'AUC: {}'.format(auctext), (0.95,0.15), fontsize=15, horizontalalignment='right' )
plt.show()

#wp = aeu.get_confusion_matrix(scores, labels, wp='maxauc')
#wp = aeu.get_confusion_matrix(scores, labels, wp=0)
#print('working point: {}'.format(wp))
NOTE: scores of +inf were reset to 745.4400719213812

png

calculating ROC curve on 100 bootstrap samples of size 275985

png
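
As a cross-check of the bootstrapped ROC curve above, the AUC can also be computed directly with scikit-learn on the same scores and labels (a label of 1 marks an anomalous lumisection):

from sklearn.metrics import roc_auc_score, roc_curve

auc_check = roc_auc_score(labels, scores)
fpr, tpr, thresholds = roc_curve(labels, scores)
print('AUC (sklearn cross-check): {:.4f}'.format(auc_check))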

### investigate particular lumisections

# initialization: general
mode = 'ls'
run = 299316
# for mode 'ls' (ignored if mode is 'run'):
ls = 70
# for mode 'run' (ignored if mode is 'ls'):
run_masknames = ['dcson','highstat']

# initialization: reference scores
plot_refscores = True
refscore_masknames = ['dcson','highstat']

# initialization: reference histograms
refhists = {}
for histname in histstruct.histnames: 
    refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['highstat','dcson']), 50 )

if mode=='ls':
    # plot this particular run/ls
    fig,axs = histstruct.plot_ls( run, ls, recohist=None, refhists=refhists, 
                                opaque_legend=True,
                                titledict = titledict, titlesize=15,
                                ncols = 3,
                                physicalxax = True,
                                xaxtitledict = xaxtitledict, xaxtitlesize=17,
                                yaxtitledict = yaxtitledict, yaxtitlesize=17,
                                ymaxfactor = 1.3,
                                legendsize = 13
                              )
    # stylistic modifications
    counter = -1
    for i in range(axs.shape[0]): 
        for j in range(axs.shape[1]):
            counter += 1
            if counter>=len(histstruct.histnames): break
            ax = axs[i,j]
            pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
            pu.add_text( ax, extratextdict[histstruct.histnames[counter]], (0.95,0.6), fontsize=15, horizontalalignment='right' )
            pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
    fig.subplots_adjust(wspace=0.3, hspace=0.3)
    plt.show()

    # only for moment classifier: do some prints
    #hist = histstruct.histograms['chargeInner_PXLayer_2'][histstruct.get_index(run,ls)]
    #histstruct.classifiers['chargeInner_PXLayer_2'].printout(hist)

    # print the mses
    msepoint = histstruct.get_scores_ls( modelname, run, ls )
    logprob = np.log( histstruct.evaluate_fitter_on_point( modelname, msepoint ) )
    print('-------------')
    print('MSE values:')
    for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
    print('-------------')
    print('logprob: '+str(logprob))

    # plot mse distribution
    if plot_refscores:
        fig,axs = histstruct.plot_ls_score( modelname, run, ls, masknames=refscore_masknames, 
                        nbins=100, normalize=True,
                        siglabel='This lumisection', bcklabel='All lumisections',
                        sigcolor='k', bckcolor='b',
                        title=None, 
                        xaxtitle='MSE', xaxtitlesize=15,
                        yaxtitle='Normalized number of lumisections', yaxtitlesize=15,
                        doshow=False)
        # stylistic modifications
        counter = -1
        for i in range(axs.shape[0]): 
            for j in range(axs.shape[1]):
                counter += 1
                if counter>=len(histstruct.histnames): break
                ax = axs[i,j]
                pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
                pu.add_text( ax, extratextdict[histstruct.histnames[counter]], (0.95,0.6), fontsize=15, horizontalalignment='right' )
                pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
        fig.subplots_adjust(wspace=0.3, hspace=0.3)


if mode=='run':
    # plot given run
    runnbs = histstruct.get_runnbs( masknames=run_masknames )
    lsnbs = histstruct.get_lsnbs( masknames=run_masknames )
    runsel = np.where(runnbs==run)
    lsnbs = lsnbs[runsel]
    print('plotting {} lumisections...'.format(len(lsnbs)))
    for lsnb in lsnbs:
        fig,ax = histstruct.plot_ls(run, lsnb, recohist=None, refhists=refhists, opaque_legend=True )
        plt.show()
        msepoint = histstruct.get_scores_ls( modelname, run, lsnb )
        msepointarray = np.array([msepoint[histname] for histname in histstruct.histnames])
        logprob = np.log( histstruct.evaluate_fitter_on_point( modelname, msepoint ) )
        print('-------------')
        print('MSE values:')
        for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
        print('-------------')
        print('logprob: '+str(logprob))

png

-------------
MSE values:
chargeInner_PXLayer_2 : [0.00027372]
chargeInner_PXLayer_3 : [4.50905996e-07]
-------------
logprob: 13.075970512248668

png

### investigate how the method performs on the golden/custom test set

# choose masks for evaluation set
masks_eval = ['golden', 'highstat']
# set logprob boundaries
logup = -100
logdown = None
# set whether to do plotting
doplot = True
nplotsmax = 10
# set properties of file to save
dosave = False
pdfname = ''

# get the lumisections within chosen logprob range
score_up = np.exp(logup) if logup is not None else None
score_down = np.exp(logdown) if logdown is not None else None
nls_eval = len(histstruct.get_lsnbs(masknames=masks_eval))
runsls_in_range = histstruct.get_globalscores_runsls( modelname, masknames=masks_eval,
                                                      score_up = score_up, score_down = score_down )
runnbs_in_range = runsls_in_range[0]
lsnbs_in_range = runsls_in_range[1]
print('{} out of {} LS are within these boundaries'.format(len(runnbs_in_range), nls_eval))

if doplot:

    # define reference histograms
    refhists = {}
    for histname in histstruct.histnames:
        refhists[histname] = hu.averagehists( histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 25 )

    # make plots
    from matplotlib.backends.backend_pdf import PdfPages
    if dosave: pdf = PdfPages(pdfname)
    for i,(runnb,lsnb) in enumerate(zip(runnbs_in_range, lsnbs_in_range)):
        if i>=nplotsmax:
            print('maximum number of plots reached')
            break
        print('------------------------')
        # plot this particular run/ls
        fig,axs = histstruct.plot_ls( runnb, lsnb, recohist=None, refhists=refhists, opaque_legend=True,
                                    titledict = titledict, titlesize=15,
                                    ncols = 3,
                                    physicalxax = True,
                                    xaxtitledict = xaxtitledict, xaxtitlesize=17,
                                    yaxtitledict = yaxtitledict, yaxtitlesize=17,
                                    ymaxfactor = 1.3,
                                    legendsize = 13
                                    )
        # stylistic modifications
        counter = -1
        for i in range(axs.shape[0]): 
            for j in range(axs.shape[1]):
                counter += 1
                if counter>=len(histstruct.histnames): break
                ax = axs[i,j]
                pu.add_cms_label( ax, pos=(0.02,1.01), extratext='Preliminary', fontsize=16 )
                pu.add_text( ax, extratextdict[histstruct.histnames[counter]], (0.95,0.6), fontsize=15, horizontalalignment='right' )
                pu.add_text( ax, '2017 (13 TeV)', (1,1.01), fontsize=14, horizontalalignment='right' )
        fig.subplots_adjust(wspace=0.3, hspace=0.3)
        plt.show()
        if dosave: pdf.savefig(fig)
        msepoint = histstruct.get_scores_ls( modelname, runnb, lsnb )
        msepointarray = np.array([msepoint[histname] for histname in histstruct.histnames])
        logprob = np.log( histstruct.evaluate_fitter_on_point( modelname, msepoint ) )
        print('-------------')
        print('MSE values:')
        for histname in histstruct.histnames: print('{} : {}'.format(histname,msepoint[histname]))
        print('-------------')
        print('logprob: '+str(logprob))
    if dosave: pdf.close()
0 out of 201393 LS are within these boundaries