Exploration of data generation ('resampling') methods

This notebook lists and plots some of the implemented methods for resampling histograms in order to artificially increase the statistics of training or testing sets.

### imports

# external modules
import sys
import os
import matplotlib.pyplot as plt

# local modules
import hist_utils as hu
import dataframe_utils as dfu
import generate_data_utils as gdu
import plot_utils as pu
import DataLoader
### load the data
# This cell expects a csv file at the location configured below, holding
# histograms of one single type only; the tutorial read_and_write_data shows
# examples of how to create such files.

# input file configuration
datadir = '../data'
histname = 'chargeInner_PXLayer_2'
filename = 'DF2017_'+histname+'.csv'

# read the csv file into a dataframe
dloader = DataLoader.DataLoader()
inputfile = os.path.join(datadir, filename)
df = dloader.get_dataframe_from_file( inputfile )

# report the shape of the first element returned by get_hist_values
rawshape = dfu.get_hist_values(df)[0].shape
print('raw input data shape: {}'.format( rawshape ))

# histograms of the full dataframe (prepared with donormalize=True),
# used as input by the resampling sections further down
allhists = hu.preparedatafromdf(df,donormalize=True)

# note: depending on which histogram you are looking at, the 'good' and 'bad' runs defined below might not be good or bad at all!
#       you will need to find a set of clearly good and bad runs for your type(s) of histogram.
goodrunsls = {'2017':

badrunsls = {'2017':

# select the histograms matching the 2017 entries of goodrunsls / badrunsls
# and prepare them in the same way as the full set (donormalize=True)
goodhists = hu.preparedatafromdf(dfu.select_runsls(df,goodrunsls['2017']),donormalize=True)
badhists = hu.preparedatafromdf(dfu.select_runsls(df,badrunsls['2017']),donormalize=True)

# plot some together
# (the trailing semicolon keeps the returned (figure, axes) tuple out of the cell output)
pu.plot_sets([goodhists,badhists],colorlist=['b','r'],labellist=['"good" histograms','"bad" histograms']);
INFO in DataLoader.get_dataframe_from_file: loading dataframe from file ../data/DF2017_chargeInner_PXLayer_2.csv...
INFO in DataLoader.get_dataframe_from_file: sorting the dataframe...
INFO in DataLoader.get_dataframe_from_file: loaded a dataframe with 225954 rows and 16 columns.
raw input data shape: (225954, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)


### also select a seed

# run number of the seed, defined once so that the single-histogram selection
# and the whole-run selection below cannot get out of sync
seedrun = 297056
# NOTE(review): presumably the lumisection of the seed, used as a [first, last]
# range of length one -- confirm against dfu.select_runsls
seedls = 100

# the seed histogram itself, and all histograms of the run it belongs to
seed = dfu.select_runsls(df,{str(seedrun):[[seedls,seedls]]})
run = dfu.select_runs(df,[seedrun])
seedhist = hu.preparedatafromdf(seed,donormalize=True)
runhists = hu.preparedatafromdf(run,donormalize=True)

# plot some together
# (the trailing semicolon keeps the returned (figure, axes) tuple out of the cell output)
pu.plot_sets([runhists,seedhist],colorlist=['lightblue','k'],labellist=['histograms','seed histogram']);
(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)


### testing section for fourier_noise_on_mean

# resample from the full set; only the first of the three returned values is kept
fourier_mean_options = dict(nresamples=10, nonnegative=True, doplot=True)
reshists, _, _ = gdu.fourier_noise_on_mean(allhists, **fourier_mean_options)

# compare the sizes of the input set and the resampled set
print('size of original set: {}'.format(allhists.shape))
print('size of resampled set: {}'.format(reshists.shape))
pu.plot_sets([hu.select_random(allhists, nselect=3), hu.select_random(reshists, nselect=3)],
             labellist=['original histograms','resampled histograms'],
size of original set: (225954, 102)
size of resampled set: (10, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)



### testing section for fourier_noise

# resample the good and the bad set separately; the bad set is resampled
# with nresamples=9 and an explicit stdfactor
greshists, _, _ = gdu.fourier_noise(
    goodhists,
    nresamples=10,
    nonnegative=True,
    doplot=True,
)
breshists, _, _ = gdu.fourier_noise(
    badhists,
    nresamples=9,
    nonnegative=True,
    stdfactor=3.,
    doplot=True,
)

# report the sizes of both resampled sets
print('size of resampled good set: {}'.format(greshists.shape))
print('size of resampled bad set: {}'.format(breshists.shape))
pu.plot_sets([hu.select_random(greshists, nselect=100), hu.select_random(breshists, nselect=100)],
             labellist=['resampled good histograms','resampled bad histograms'],
size of resampled good set: (7380, 102)
size of resampled bad set: (1449, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)




### testing section for resample_bin_per_bin

# bin-per-bin resampling of the full set (smoothinghalfwidth=0);
# only the first of the three returned values is kept
bin_per_bin_options = dict(nresamples=10, nonnegative=True, smoothinghalfwidth=0, doplot=True)
reshists, _, _ = gdu.resample_bin_per_bin(allhists, **bin_per_bin_options)

# compare the sizes of the input set and the resampled set
print('size of original set: {}'.format(allhists.shape))
print('size of resampled set: {}'.format(reshists.shape))
pu.plot_sets([hu.select_random(allhists, nselect=3), hu.select_random(reshists, nselect=3)],
             labellist=['original histograms','resampled histograms'],
size of original set: (225954, 102)
size of resampled set: (10, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)



### testing section for resample_similar_bin_per_bin

# resample the good and the bad set against the full set;
# the two calls differ only in their keeppercentage
greshists, _, _ = gdu.resample_similar_bin_per_bin(
    allhists,
    goodhists,
    nresamples=3,
    nonnegative=True,
    keeppercentage=0.005,
    doplot=True,
)
breshists, _, _ = gdu.resample_similar_bin_per_bin(
    allhists,
    badhists,
    nresamples=3,
    nonnegative=True,
    keeppercentage=0.003,
    doplot=True,
)

# report the sizes of both resampled sets
print('size of resampled good set: {}'.format(greshists.shape))
print('size of resampled bad set: {}'.format(breshists.shape))
pu.plot_sets([hu.select_random(greshists, nselect=100), hu.select_random(breshists, nselect=100)],
             labellist=['resampled good histograms','resampled bad histograms'],
Note: bin-per-bin resampling performed on 12 histograms.
If this number is too low, existing histograms are drawn with too small variation.
If this number is too high, systematic shifts of histograms can be averaged out.
Note: bin-per-bin resampling performed on 7 histograms.
If this number is too low, existing histograms are drawn with too small variation.
If this number is too high, systematic shifts of histograms can be averaged out.
size of resampled good set: (2214, 102)
size of resampled bad set: (483, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)




### testing section for resample_similar_fourier_noise

# identical settings for the good and the bad set, so they are defined once
similar_fourier_options = dict(nresamples=3, nonnegative=True, keeppercentage=0.001, doplot=True)
greshists, _, _ = gdu.resample_similar_fourier_noise(allhists, goodhists, **similar_fourier_options)
breshists, _, _ = gdu.resample_similar_fourier_noise(allhists, badhists, **similar_fourier_options)

# report the sizes of both resampled sets
print('size of resampled good set: {}'.format(greshists.shape))
print('size of resampled bad set: {}'.format(breshists.shape))
pu.plot_sets([hu.select_random(greshists, nselect=100), hu.select_random(breshists, nselect=100)],
             labellist=['resampled good histograms','resampled bad histograms'],
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3440: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:181: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:262: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:222: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:251: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(

Note: mean and std calculation is performed on 3 histograms.
If this number is too low, histograms might be too similar for averaging to have effect.
If this number is too high, systematic shifts of histogram shapes are included into the averaging.

/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3440: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:181: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:262: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:222: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
/cvmfs/sft.cern.ch/lcg/views/LCG_101swan/x86_64-centos7-gcc8-opt/lib/python3.9/site-packages/numpy/core/_methods.py:251: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(

Note: mean and std calculation is performed on 3 histograms.
If this number is too low, histograms might be too similar for averaging to have effect.
If this number is too high, systematic shifts of histogram shapes are included into the averaging.
size of resampled good set: (2214, 102)
size of resampled bad set: (483, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)




### testing section for resample_similar_lico

# resample the good and the bad set against the full set; note that the two
# calls use different nresamples, nonnegative and keeppercentage settings
greshists, _, _ = gdu.resample_similar_lico(
    allhists,
    goodhists,
    nresamples=10,
    nonnegative=True,
    keeppercentage=0.1,
    doplot=True,
)
breshists, _, _ = gdu.resample_similar_lico(
    allhists,
    badhists,
    nresamples=1,
    nonnegative=False,
    keeppercentage=0.001,
    doplot=True,
)

# report the sizes of both resampled sets
print('size of resampled good set: {}'.format(greshists.shape))
print('size of resampled bad set: {}'.format(breshists.shape))
pu.plot_sets([hu.select_random(greshists, nselect=100), hu.select_random(breshists, nselect=100)],
             labellist=['resampled good histograms','resampled bad histograms'],
Note: linear combination is taken between 226 histograms.
If this number is too low, histograms might be too similar for combination to have effect.
If this number is too high, systematic shifts of histogram shapes are included into the combination
Note: linear combination is taken between 3 histograms.
If this number is too low, histograms might be too similar for combination to have effect.
If this number is too high, systematic shifts of histogram shapes are included into the combination
size of resampled good set: (7380, 102)
size of resampled bad set: (161, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)




### testing section for mc_sampling

# resample starting from the single seed histogram selected earlier;
# only the first of the three returned values is kept
mc_options = dict(nresamples=10, nMC=10000, doplot=True)
reshists, _, _ = gdu.mc_sampling(seedhist, **mc_options)

# report the size of the resampled set
print('size of resampled set: {}'.format(reshists.shape))
pu.plot_sets([seedhist, hu.select_random(reshists, nselect=3)],
             labellist=['original histogram','resampled histograms'],
size of resampled set: (10, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)



### testing section for white_noise

# add white noise to the good and the bad set; the two calls differ only in stdfactor
good_stdfactor = 15
bad_stdfactor = 3.
greshists, _, _ = gdu.white_noise(goodhists, stdfactor=good_stdfactor, doplot=True)
breshists, _, _ = gdu.white_noise(badhists, stdfactor=bad_stdfactor, doplot=True)

# report the sizes of both resampled sets
print('size of resampled good set: {}'.format(greshists.shape))
print('size of resampled bad set: {}'.format(breshists.shape))
pu.plot_sets([hu.select_random(greshists, nselect=100), hu.select_random(breshists, nselect=100)],
             labellist=['resampled good histograms','resampled bad histograms'],
size of resampled good set: (738, 102)
size of resampled bad set: (161, 102)

(<Figure size 432x288 with 1 Axes>, <AxesSubplot:>)


