lib/SessionData.js

'use strict';
import * as TENSOR_FLOW from '@tensorflow/tfjs-node';
import * as Utils from './Utils';
//TODO: PERF: This class is due for a memory-footprint upgrade. It carries
//            duplicates of the inputs, because most runs need them as both TF
//            tensors and raw arrays; that's problematic under a bad RAM:data
//            ratio.
//
//            TLDR: We will save memory in exchange for (potentially
//            significant) CPU hits.
//
//            Details:
//            In some use cases the array versions aren't required (e.g. when
//            the user does not use standardization). Further, even when the
//            arrays _are_ required, we don't need to store them here. We can
//            use TF's Tensor.arraySync() to reproduce the data in array form
//            only while it's needed. Stricter still, we could discard the data
//            when it's not needed, then re-read it from (temp) local data
//            files, so the two forms never share memory except at startup.
//            Convert this to an abstract base, and write concrete versions for
//            the user's desired mem/speed balance:
//
//            SessionData
//              > SessionDataStandardized
//              > SessionDataStandardizedFaster
//              > SessionDataStandardizedSmaller
/**
 * Manages the data set used to train and test models during the grid search.
 */
class SessionData {
    /**
     * Creates an instance of SessionData.
     * @param {number} proofPercentage A value between 0 and 1 (exclusive)
     *	that determines the number of cases reserved for generalization
     *	testing. These cases are never seen by the model during training.<br>
     *	A common value is 0.2 (20%).
     * @param {DataSet} dataSet The data used to train and test models.
     * @param {boolean} _useDefaultStandardization If true, the input values
     *	will be modified internally such that each feature has a mean of zero
     *	and a variance of one.<br>
     *	If a standardization callback is supplied, this argument is ignored.
     * @param {function} [_callbackStandardize] A function invoked with the
     *	input data prior to the grid search. It provides an opportunity to
     *	preprocess the data internally before it's transformed into tensors.<br>
     *  <b>Arguments:</b> unstandardizedInputs: Array<unknown><br>
     *  <b>Returns:</b> void
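     * @example
     * // A minimal sketch; this DataSet literal is only illustrative, shaped
     * // to match this file's usage (an object exposing 'inputs' and
     * // 'targets' arrays):
     * const dataSet = {
     *     inputs: [[0, 0], [0, 1], [1, 0], [1, 1]],
     *     targets: [[0], [1], [1], [0]]
     * };
     * // reserve 25% of the cases for generalization testing, with default
     * // standardization enabled
     * const session = new SessionData(0.25, dataSet, true);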
     */
    constructor(proofPercentage, dataSet, _useDefaultStandardization, _callbackStandardize) {
        this._useDefaultStandardization = _useDefaultStandardization;
        this._callbackStandardize = _callbackStandardize;
        this._totalInputNeurons = 0;
        this._totalOutputNeurons = 0;
        this._totalTrainingCases = 0;
        Utils.Assert(proofPercentage > 0.0);
        Utils.Assert(proofPercentage < 1.0);
        const rawInputs = dataSet.inputs;
        const rawTargets = dataSet.targets;
        SessionData.ValidateRawData(rawInputs);
        SessionData.ValidateRawData(rawTargets);
        this._totalInputNeurons = CountLeafElements(rawInputs);
        this._totalOutputNeurons = CountLeafElements(rawTargets);
        console.log('inputs: ' + this._totalInputNeurons);
        console.log('outputs: ' + this._totalOutputNeurons);
        // create a clone of these inputs pre-standardization, to be used
        // (potentially) for human-friendly reporting
        this._rawInputsTraining = JSON.parse(JSON.stringify(rawInputs));
        //NOTE: This call validates and sets the callback members, as needed.
        this.SetupStandardization();
        //NOTE: TODO: We don't standardize targets yet, although that will be desirable
        //            for regression networks. When we make that change, support it with
        //            a default and optional callbacks.
        if (this._callbackStandardize) {
            this._callbackStandardize(rawInputs);
        }
        else if (this._useDefaultStandardization) {
            // standardize (to mean zero, variance one)
            StandardizeInputs(rawInputs);
        }
        // move a portion of the cases into a 'proof' set, to be used after
        // training to measure generalization
        const TOTAL_CASES = rawInputs.length;
        const PROOF_COUNT = Math.round(TOTAL_CASES * proofPercentage);
        console.log('total cases: ' + TOTAL_CASES + ', with ' + PROOF_COUNT
            + ' reserved for generalization tests');
        if (PROOF_COUNT < 1) {
            throw new Error('The provided proofPercentage is too low. Zero '
                + 'cases moved from the training set.');
        }
        if (PROOF_COUNT >= TOTAL_CASES) {
            throw new Error('The provided proofPercentage is too high. 100% of '
                + 'cases moved from the training set.');
        }
        //NOTE: "unknown" feels like a copout, but there is no other (even remotely
        //      clean) way to inform the upcoming array shift/push calls. This would
        //      have to be initialized to the depth of the user input, which is unknown
        //      (ha!) at compile time. I could measure it outsie, and create a generic
        //      class/interface of some kind (in lieu of six differnet support objects
        //      (at least), and gobs of duplication).
        //TODO: (low-pri, but good exercise) Look into a generic class,
        //      e.g. DeepTrainingData::Array<T>.
        const PROOF_INPUTS = [];
        const PROOF_TARGETS = [];
        // we also carry a copy of the proof subset, in its original,
        // unstandardized form
        //NOTE: Cases are migrated from _rawInputsTraining, so that afterward the
        //      standardized and raw collections match, i.e. both of these are true:
        //          PROOF_INPUTS.length === _rawInputsProof.length
        //          rawInputs.length === _rawInputsTraining.length
        this._rawInputsProof = [];
        for (let i = 0; i < PROOF_COUNT; ++i) {
            /* istanbul ignore next */ //[FUTURE PROOFING]
            if (rawInputs.length === 0) {
                throw new Error('Inputs array emptied prematurely');
            }
            /* istanbul ignore next */ //[FUTURE PROOFING]
            if (rawTargets.length === 0) {
                throw new Error('Targets array emptied prematurely');
            }
            //NOTE: This assign-first-then-shift approach is not ideal, and it's only done
            //      as a workaround to the conversion problems I had w/ TFNestedArray;
            //      possibly alleviated by the Array<unknown> change, but that's also an
            //      unacceptable solution, long term.
            PROOF_INPUTS[i] = rawInputs[0];
            PROOF_TARGETS[i] = rawTargets[0];
            rawInputs.shift();
            rawTargets.shift();
            this._rawInputsProof[i] = this._rawInputsTraining[0];
            this._rawInputsTraining.shift();
        }
        // store the targets of the cases we separated from the training set
        this._proofTargets = PROOF_TARGETS;
        // convert the proof inputs to tensors, for the post-training prediction
        // step
        this._proofInputsTensor = TENSOR_FLOW.tidy(() => {
            return TENSOR_FLOW.tensor(PROOF_INPUTS);
        });
        // convert the training data to tensors, for the model-fit step
        this._trainingInputsTensor = TENSOR_FLOW.tidy(() => {
            return TENSOR_FLOW.tensor(rawInputs);
        });
        this._trainingTargetsTensor = TENSOR_FLOW.tidy(() => {
            return TENSOR_FLOW.tensor(rawTargets);
        });
        this._totalTrainingCases = rawInputs.length;
    }
    get proofInputsTensor() { return this._proofInputsTensor; }
    get proofTargets() { return this._proofTargets; }
    get rawInputsProof() { return this._rawInputsProof; }
    get totalInputNeurons() { return this._totalInputNeurons; }
    get totalOutputNeurons() { return this._totalOutputNeurons; }
    get totalTrainingCases() { return this._totalTrainingCases; }
    get trainingInputsTensor() { return this._trainingInputsTensor; }
    get trainingTargetsTensor() { return this._trainingTargetsTensor; }
    /**
     * Determines the standardization scheme to be used.
     * @private
     */
    SetupStandardization() {
        if (!this._callbackStandardize) {
            // no callback; useDefaultStandardization will drive the behavior
            return;
        }
        // if the arguments indicate both standardization techniques (stock and
        // custom), we use custom, i.e. the user's callback
        if (this._useDefaultStandardization) {
            console.warn('A standardization callback was supplied, so default '
                + 'standardization will be ignored.');
            this._useDefaultStandardization = false;
        }
    }
    /**
     * Throws unless the input data consists solely of arrays of numbers.
     * The arrays may be nested.
     * Note that we do <i>not</i> enforce full tensor validity. TF will happily
     * throw on invalid data. This is a quick check to catch the more obvious
     * issues before training/testing, and to communicate them in a friendlier
     * manner.
     * @private
     * @static
     * @param {TFNestedArray} raw
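     * @example
     * // illustrative: flat or nested arrays of numbers pass...
     * SessionData.ValidateRawData([[1, 2], [3, 4]]); // ok
     * // ...anything else throws
     * SessionData.ValidateRawData([1, 'a']); // Error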
     */
    static ValidateRawData(raw) {
        const CHECK_ARRAYS_OF_NUMBERS_RECURSIVELY = (a) => {
            if (Array.isArray(a)) {
                if (a.length < 1) {
                    console.warn('bad empty array', a);
                    return false;
                }
                for (let i = 0; i < a.length; ++i) {
                    if (CHECK_ARRAYS_OF_NUMBERS_RECURSIVELY(a[i])) {
                        continue;
                    }
                    return false;
                }
                return true; // PASS as Array
            }
            if (typeof a === 'number') {
                return true; // PASS as Number
            }
            console.warn('bad type: ' + (typeof a) + '; (requires number or '
                + 'array)');
            return false;
        };
        if (CHECK_ARRAYS_OF_NUMBERS_RECURSIVELY(raw)) {
            return;
        }
        throw new Error('Invalid raw data. Inputs and targets must be supplied '
            + 'as arrays of numbers, flat or nested.');
    }
}
//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
//TODO: This standardization code moves into a separate lib, and/or gets replaced by simple-statistics(tm).
//      This file also has a few generic tensor tools; unsure whether TF or simple-statistics already
//      provides equivalents, but probably.
//
//TODO: Also, these functions still need tests (all the more reason to find existing implementations!).
/**
 * Returns the length of the innermost array, found by following the first
 * element at each level of nesting.
 * @param {TFNestedArray} inputData
 * @return {number}
 * @memberof SessionData
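 * @example
 * // e.g. two samples of three features each
 * CountLeafElements([[1, 2, 3], [4, 5, 6]]); // 3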
 */
function CountLeafElements(inputData) {
    Utils.Assert(inputData.length > 0);
    // find the lowest level of these (potentially) nested arrays
    let deepestArray = inputData;
    while (Array.isArray(deepestArray[0])) {
        Utils.Assert(deepestArray.length > 0);
        deepestArray = deepestArray[0];
    }
    return deepestArray.length;
}
/**
 * Calculate the average of a set of numbers.
 * @param {Array<number>} data
 * @return {number}
 * @memberof SessionData
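 * @example
 * FindMean([1, 2, 3]); // 2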
 */
function FindMean(data) {
    Utils.Assert(data.length > 0);
    let sum = 0;
    for (let i = 0; i < data.length; ++i) {
        sum += data[i];
    }
    const MEAN = sum / data.length;
    return MEAN;
}
/**
 * Calculate the population standard deviation of a set of numbers, given
 * their precomputed mean.
 * @param {Array<number>} data
 * @param {number} mean
 * @return {number}
 * @memberof SessionData
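 * @example
 * // population standard deviation (divides by N, not N - 1)
 * FindStandardDeviation([2, 4, 4, 4, 5, 5, 7, 9], 5); // 2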
 */
function FindStandardDeviation(data, mean) {
    // for each sample, subtract the mean and square the result
    const SQUARED_MEAN_DELTAS = data.map((x) => { return Math.pow(x - mean, 2); });
    const MEAN_OF_ALL_THAT = FindMean(SQUARED_MEAN_DELTAS);
    const STDEV = Math.sqrt(MEAN_OF_ALL_THAT);
    return STDEV;
}
/**
 * Adjusts input data such that each feature has a mean of zero and a variance
 * of one.
 * @param {TFNestedArray} inputData
 * @memberof SessionData
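 * @example
 * // mutates its argument in place; with a single feature whose values are
 * // 1 and 3 (mean 2, population stdev 1):
 * const data = [[1], [3]];
 * StandardizeInputs(data);
 * // data is now [[-1], [1]]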
 */
function StandardizeInputs(inputData) {
    Utils.Assert(inputData.length > 0);
    // find the lowest level of these (potentially) nested arrays
    let deepestArray = inputData;
    let tensorDimensions = 1;
    while (Array.isArray(deepestArray[0])) {
        Utils.Assert(deepestArray.length > 0);
        deepestArray = deepestArray[0];
        ++tensorDimensions;
    }
    // prepare a table for every feature value
    const TOTAL_FEATURES = deepestArray.length;
    console.log('Standardizing a ' + tensorDimensions + '-dimension tensor with '
        + TOTAL_FEATURES + ' features.');
    const FEATURE_VALUE_TABLE = [];
    for (let i = 0; i < TOTAL_FEATURES; ++i) {
        FEATURE_VALUE_TABLE.push([]);
    }
    // walk this set of (potentially nested) arrays, tabulating the feature
    // values at the bottom
    //NOTE: TODO: This is actually a basic tensor tool, I'm now realizing. Find a
    //            good tensor lib, or start one.
    //            ...after checking TF's own utils, of course!
    const RECURSIVELY_TABULATE_FEATURES = (a) => {
        Utils.Assert(a.length > 0);
        a.forEach((value, index) => {
            if (Array.isArray(value)) {
                RECURSIVELY_TABULATE_FEATURES(value);
                return;
            }
            /* istanbul ignore next */ //[FUTURE PROOFING]
            if (typeof value !== 'number') {
                throw new Error('Invalid type found while tabulating features '
                    + (typeof value));
            }
            // we've hit a 'bottom' level array (a leaf node); tabulate its
            // feature values
            FEATURE_VALUE_TABLE[index].push(value);
        });
    };
    RECURSIVELY_TABULATE_FEATURES(inputData);
    // find mean and standard deviation for each feature
    const MEANS = [];
    const STANDARD_DEVIATIONS = [];
    for (let i = 0; i < TOTAL_FEATURES; ++i) {
        const FEATURE_MEAN = FindMean(FEATURE_VALUE_TABLE[i]);
        const FEATURE_STDEV = FindStandardDeviation(FEATURE_VALUE_TABLE[i], FEATURE_MEAN);
        MEANS.push(FEATURE_MEAN);
        STANDARD_DEVIATIONS.push(FEATURE_STDEV);
    }
    // walk this set of (potentially) nested arrays, adjusting each feature set
    // to mean zero and variance one
    const RECURSIVELY_STANDARDIZE_FEATURES = (a) => {
        Utils.Assert(a.length > 0);
        a.forEach((value, index, array) => {
            if (Array.isArray(value)) {
                RECURSIVELY_STANDARDIZE_FEATURES(value);
                return;
            }
            /* istanbul ignore next */ //[FUTURE PROOFING]
            if (typeof value !== 'number') {
                throw new Error('Invalid type found during default '
                    + 'standardization ' + (typeof value));
            }
            // we've hit a 'bottom' level array (a leaf node)
            //NOTE: We use this unnecessary, temporary 'sample' as an extra register. This
            //      is purely done because TypeScript does not like my TFInputsArray type.
            //      That type was written to handle nested arrays, but it's causing other
            //      problems, primarily within this file.
            //
            //TODO: This can be 'solved' with this cast: "const NUMBER_ARRAY = array as
            //      Array<number>;", but that seems as ugly as this, if not uglier. I need
            //      further investigation of map/reduce/filter/forEach signatures in TS.
            let sample = Number(array[index]);
            // shift left by the mean, to 'center' everything on zero
            sample -= MEANS[index];
            if (STANDARD_DEVIATIONS[index] === 0) {
                // this category (feature) has no deviation; all samples equal
                // the mean
                // set the value back into its slot
                array[index] = sample;
                return;
            }
            // divide by the standard deviation, so that all categories have a
            // variance of one
            sample /= STANDARD_DEVIATIONS[index];
            // set the value back into its slot
            array[index] = sample;
            /*KEEP: ...until the above TS issue is resolved. This is the original, and
                    there's nothing wrong with it.
                        // shift left by the mean, to 'center' everything on zero
                        array[index] -= MEANS[index];
            
                        if (STANDARD_DEVIATIONS[index] === 0) {
                            // this category (feature) has no deviation; all samples equal
                            // the mean
                            return;
                        }
            
                        // divide by the standard deviation, so that all categories have a
                        // variance of one
                        array[index] /= STANDARD_DEVIATIONS[index];
            */
        });
    };
    RECURSIVELY_STANDARDIZE_FEATURES(inputData);
}
//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Object.freeze(SessionData);
export { SessionData };
//# sourceMappingURL=SessionData.js.map