/* eslint-disable no-nested-ternary,max-lines,complexity */
/**
* TODO document cum ops
* TODO replace is broken
* TODO binarizer
* TODO document imports
* TODO document matrix ops
* TODO document add, mul, div, sub
* TODO document opts, printing presets
* TODO document examples of matrix ops, aggs, functs
* TODO calculate the *base* size of each column (pointers are 8B)
*/
const util = require('util');
const { join, resolve, dirname } = require('path');
const {
readdirSync,
readFileSync,
existsSync,
writeFileSync,
statSync
} = require('fs');
const request = require('sync-request');
const parseCSV = require('csv-parse/lib/sync');
const Column = require('./Column');
const { randInt } = require('./rand');
const {
fmtFloat,
fmtFloatSI,
dtypeRegex,
isNumber,
isString,
isRegExp,
isURL,
isMap,
isSameType,
isGenerator,
isObject,
walkFiles,
isFunction,
transpose
} = require('./utils');
const log = require('./log');
const opts = require('./opts');
const {
PROGRAMMER_PRINTING,
DEFAULT_PRINTING,
MINIMAL_PRINTING
} = require('./presets');
class DataFrame {
/**
* @param {DataFrame|GeneratorFunction|!Object<string|number,number|string|string[]|number[]|ColStr|ColNum|TypedArray>|Array<Array<number|string>>|Array<TypedArray|number[]|string[]|ColStr|ColNum>|!Map<String,number|string|number[]|string[]>} data
* @param {string} [what]
* @param {string[]|?ColStr} [colNames] labels for every column (#cols === #labels)
* @param {?ArrayLike<?DType|'s'>} [dtypes]
*/
constructor(data = [], what = '', colNames = null, dtypes = null) {
const prefix = `${this.constructor.name}.constructor()`;
const logIt = (lvl, msg) => log[lvl](`${prefix} ${msg}`);
const warn = (msg) => logIt('warn', msg);
const info = (msg) => logIt('info', msg);
const debug = (msg) => logIt('debug', msg);
if (what.toLowerCase() !== what) {
debug(`lowercasing "${what}"`);
return new DataFrame(data, what.toLowerCase(), colNames, dtypes);
// another data frame, shallow copy it
} else if (what === 'df') {
info(`data is another ${this.constructor.name}, making a shallow copy`);
return new DataFrame(
[...data.cols],
'cols',
[...data.colNames],
data.dtypes,
);
} else if (isSameType(this, data)) {
info(`detected ${this.constructor.name}`);
return new DataFrame(data, 'df', colNames, dtypes);
} else if (isString(data)) {
if (what.startsWith('url')) {
info(`GET ${data}`);
let newWhat;
if (what === 'url') {
newWhat = data.search(/\.json$/i) >= 0 ? 'json' : 'csv';
} else {
newWhat = what.replace(/^url/, '');
}
const s = request('GET', data).getBody().toString('utf-8');
return new DataFrame(s, newWhat, colNames, dtypes);
} else if (isURL(data)) {
info('detected URL');
return new DataFrame(data, `url${what}`, colNames, dtypes);
} else if (what.startsWith('json')) {
info('data is JSON string, parsing');
if (what === 'json') {
info(`assuming input is JSON object`);
return new DataFrame(JSON.parse(data), 'obj', colNames, dtypes);
} else {
const hint = what.replace(/^json/, '');
info(`using further hint "${hint}"`);
return new DataFrame(JSON.parse(data), hint, colNames, dtypes);
}
} else if (what.startsWith('csv')) {
info('data is CSV string, parsing');
const rows = parseCSV(data, { skip_empty_lines: true, trim: true });
if (colNames === null) {
warn(`column names not provided, assuming first row is column names`);
const header = rows.splice(0, 1)[0];
return new DataFrame(rows, 'rows', header, dtypes);
} else {
debug(`using provided column names ${colNames.toString()}`);
return new DataFrame(rows, 'rows', colNames, dtypes);
}
} else if (what.startsWith('file')) {
info(`data is file "${data}"`);
if (what === 'file') {
const ext = (/\.([^.]+)$/i.exec(data) || [null, 'csv'])[1];
info(`using extension ".${ext}" as hint`);
return new DataFrame(readFileSync(data).toString('utf-8'), ext, colNames, dtypes);
} else {
const newWhat = what.replace(/^file/i, '');
info(`using provided hint for file content type "${newWhat}"`);
return new DataFrame(readFileSync(data).toString('utf-8'), newWhat, colNames, dtypes);
}
} else if (existsSync(data)) {
info(`detected existing file "${data}"`);
return new DataFrame(data, 'file', colNames, dtypes);
// dataset name with *.csv ending (walk recursively)
} else if (data.search(/\.(csv|json)$/i) >= 0) {
info(`data is dataset file name "${data}"`);
for (const path of walkFiles(...opts.DATASETS)) {
if (path.endsWith(data)) {
debug(`located dataset "${path}"`);
return new DataFrame(path, 'file', colNames, dtypes);
}
}
// dataset name WITHOUT *.csv ending
} else {
info(`data is dataset name "${data}" (no extension)`);
for (const path of walkFiles(...opts.DATASETS)) {
if (path.search(new RegExp(`${data}\\.(json|csv)`, 'i')) >= 0) {
debug(`located dataset "${path}"`);
return new DataFrame(path, 'file', colNames, dtypes);
}
}
}
const lookedIn = opts.DATASETS.concat([dirname(data)]).map((p) => resolve(p)).join(', ');
throw new Error(`failed to find file, looked for a dataset in ${lookedIn} (you might want to push your dir to opts.DATASETS OR set 'what', see API)`);
} else if (what.startsWith('gen')) {
info(`input is generator`);
const rows = [];
for (const r of data()) {
rows.push(r);
}
let newWhat = 'rows';
if (what !== 'gen') {
newWhat = what.replace(/^gen/, '');
} else {
info('assuming row generator');
}
return new DataFrame(rows, newWhat, colNames, dtypes);
} else if (isGenerator(data)) {
info(`input is row generator`);
return new DataFrame(data, 'gen', colNames, dtypes);
// javascript Object (PARSED)
} else if (what.startsWith('obj')) {
info('received Object');
// if { String: Number | String }
for (const key of Object.keys(data)) {
const val = data[key];
if (isNumber(val) || isString(val)) {
info('treating keys as column 1 and values as column 2');
return new DataFrame(
[Object.keys(data), Object.values(data)],
'cols',
colNames === null ? ['Key', 'Value'] : colNames,
dtypes,
);
}
}
// else
info('treating keys as column names and values as columns');
return new DataFrame(
Object.values(data),
'cols',
Object.keys(data),
dtypes,
);
} else if (isObject(data)) {
info('detected Object');
return new DataFrame(data, `obj${what}`, colNames, dtypes);
// map { col1 => col2, ... }
} else if (what.startsWith('map')) {
info(`data is Map (len is ${data.size})`);
// reverse logic to Object (above)
for (const v of data.values()) {
if (!isString(v) && !isNumber(v)) {
info('using keys as column names and values as columns');
return new DataFrame(
[...data.values()],
'cols',
[...data.keys()],
dtypes,
);
}
}
info('using keys as column 1 and values as column 2');
const keys = [...data.keys()];
const values = [...data.values()];
return new DataFrame(
[keys, values],
'cols',
colNames === null ? ['Key', 'Value'] : colNames,
dtypes,
);
} else if (isMap(data)) {
info('detected Map');
return new DataFrame(data, `map${what}`, colNames, dtypes);
// array of rows
} else if (what.startsWith('rows')) {
info(`data is array of ${data.length} rows, transposing to columns`);
return new DataFrame(transpose(data), 'cols', colNames, dtypes);
// array of cols
} else if (what.startsWith('cols')) {
debug(`input is array of ${data.length} columns`);
this.cols = data.map((c, cIdx) => {
const printName = `col #${cIdx}${colNames !== null && colNames[cIdx] !== undefined ? ` (${colNames[cIdx]})` : ''}`;
const isCol = Column.isCol(c);
if (isCol) {
debug(`${printName} is already a Column`);
}
const dtypeGiven = dtypes !== null && dtypes[cIdx] !== null && dtypes[cIdx] !== undefined;
debug(dtypeGiven ? `dtype "${dtypes[cIdx]}" given for ${printName}` : `need to guess dtype of ${printName}`);
const noNeedToConvert = isCol && (!dtypeGiven || c.dtype === dtypes[cIdx]);
if (noNeedToConvert) {
debug(`no need to convert ${printName}`);
return c;
}
debug(`converting ${printName}`);
// else
return Column.from(c, !dtypes || !dtypes[cIdx] ? null : dtypes[cIdx], false);
});
const { nCols } = this;
if (colNames === null) {
this.colNames = Column.from(Array(nCols).fill('').map((_, idx) => `col${idx}`));
} else {
this.colNames = Column.from(colNames.map((cName) => cName.toString()), 's');
for (let i = 0; i < nCols; i++) {
const colName = this.colNames[i];
const attributes = {
get() { return this[i]; },
set(newCol) {
this[i] = Column.from(newCol);
debug(`column ${colName} replaced with ${this[i].toString()}`);
},
};
Object.defineProperty(this.cols, colName, attributes);
debug(`registered col.${colName} to point to col[${i}]`);
}
debug(`used provided column names: [${this.colNames.join(', ')}]`);
}
} else if (what === '') {
return new DataFrame(data, 'cols', colNames, dtypes);
} else throw new Error('unrecognised input data');
const attrNames = new Set(this.colNames);
const { nCols } = this;
// index using cols integers AND column names
for (let cIdx = 0; cIdx < nCols; cIdx++) {
attrNames.add(cIdx);
}
/*
* easy access e.g. df.age, df.salary
* easy replacement (assignment) of cols e.g. df.age = df2.age;
* easy broadcasting e.g. df.label = 0;
*/
for (const name of attrNames) {
if (this[name] === undefined && isString(name)) {
Object.defineProperty(this, name, {
get() { return this.col(name); },
set(val) {
this.cols[this.colIdx(name)] = isNumber(val) || isString(val)
// broadcast
? Column.repeat(this.length, val)
// replace
: val;
},
});
}
}
const isUndef = (f) => this[f] === undefined;
// functs and aggs are forwarded to the underlying column
for (const section of Object.keys(opts.AGG)) {
for (const f of opts.AGG[section].filter(isUndef)) {
switch (section) {
case 'num':
this[f] = function (...args) { return this.numeric.agg(f, ...args); };
break;
case 'str':
this[f] = function (...args) { return this.nominal.agg(f, ...args); };
break;
case 'all':
this[f] = function (...args) { return this.agg(f, ...args); };
break;
default:
throw new Error(`unrecognised section "${section}" in opts.AGG`);
}
}
}
// each function is a function (Column => Column)
for (const section of Object.keys(opts.FUNCTS)) {
for (const f of opts.FUNCTS[section].filter(isUndef)) {
switch (section) {
case 'num':
this[f] = function (colId = null, ...args) {
const resultDF = this.clone();
const numDF = this.numeric.call(colId, f, ...args);
const { numColIdxs } = this;
let numCIdx = 0;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
if (numColIdxs.has(cIdx)) {
resultDF.cols[cIdx] = numDF.cols[numCIdx];
numCIdx++;
}
}
return resultDF;
};
break;
case 'str':
this[f] = function (colId = null, ...args) {
const resultDF = this.clone();
const strDF = this.nominal.call(colId, f, ...args);
const { numColIdxs } = this;
let strCIdx = 0;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
if (!numColIdxs.has(cIdx)) {
resultDF.cols[cIdx] = strDF.cols[strCIdx];
strCIdx++;
}
}
return resultDF;
};
break;
case 'all':
this[f] = function (colId = null, ...args) { return this.call(colId, f, ...args); };
break;
default:
throw new Error(`unrecognised section "${section}" in opts.AGG`);
}
}
}
// special cases, when called *without* any param, treat as agg
for (const f of [
'add', 'sub', 'mul', 'div', 'pow'
].filter(isUndef)) {
this[f] = function (...args) {
if (args.length === 0) {
debug(`no args to this.${f}() so treating as aggregate`);
return this.numeric.agg(f);
} else {
const fst = args[0];
return this.numeric.call(fst, f, ...args.slice(1));
}
};
}
}
/**
* Construct a DataFrame from columns.
*
* @param {...ArrayLike<*>} cols
* @returns {DataFrame}
*/
static of(...cols) { return new DataFrame(cols, 'cols'); }
* [Symbol.iterator]() {
const { rows } = this;
const n = this.length;
for (let rIdx = 0; rIdx < n; rIdx++) {
yield rows[rIdx];
}
}
get rows() {
return new Proxy(this.cols, {
/**
* @param {Column[]} cols
* @param {number} idx
* @returns {Array<*>}
*/
get(cols, idx) { return cols.map((col) => col[idx]); },
/**
*
* @param {Column[]} cols
* @param {Array<*>} row
* @returns {boolean}
*/
has(cols, row) {
const nCols = cols.length;
if (nCols === 0) {
return false;
}
const nRows = cols[0].length;
for (let rIdx = 0; rIdx < nRows; rIdx++) {
let eq = true;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
if (row[cIdx] !== cols[cIdx][rIdx]) {
eq = false;
break;
}
}
if (eq) {
return true;
}
}
return false;
},
/**
* @param {Column[]} cols
* @param {number} rIdx
* @param {Array<*>} val
*/
set(cols, rIdx, val) {
for (let cIdx = 0; cIdx < cols.length; cIdx++) {
// eslint-disable-next-line no-param-reassign
cols[cIdx][rIdx] = val[cIdx];
}
}
});
}
/**
* @returns {Set<number>} set of column indexes
*/
get numColIdxs() {
const colIdxs = new Set();
const { dtypes, nCols } = this;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
if (dtypes[cIdx].search(dtypeRegex) >= 0) {
colIdxs.add(cIdx);
}
}
return Object.freeze(colIdxs);
}
/**
* @returns {Set<number>} set of column indexes
*/
get strColIdxs() {
const numCols = this.numColIdxs;
return Object.freeze(new Set(this.colNames.filter((_, idx) => !numCols.has(idx))));
}
/**
* @param {string|number} colId
* @returns {number} column index
*/
colIdx(colId) {
const msg = (m) => `${this.constructor.name}.colIdx() ${m}`;
// resolve named column
if (Number.isInteger(colId)) {
// resolve negative idx
if (colId < 0) {
const newColIdx = this.nCols + colId;
log.debug(msg(`colId ${colId} negative, resolving to #${newColIdx}`));
return this.colIdx(newColIdx);
} else if (colId >= this.nCols) {
throw new Error(msg(`#${colId}, no such column, out of bounds`));
} else {
return colId;
}
} else if (isString(colId)) {
const idx = this.colNames.findIndex((colName) => colName === colId);
if (idx < 0) {
throw new Error(msg(`failed to find matching column for "${colId}"`));
}
log.debug(msg(`col "${colId}" is string, resolved to #${idx}`));
return idx;
} else throw new Error(msg(`bad input, expected number or string but got ${colId}`));
}
/**
* @returns {DataFrame} a data frame with numeric cols
*/
get numeric() {
const { numColIdxs } = this;
return this.select((cId) => numColIdxs.has(this.colIdx(cId)));
}
/**
* @returns {DataFrame} a data frame with numeric cols
*/
get nominal() {
const { numColIdxs } = this;
return this.select((cId) => !numColIdxs.has(this.colIdx(cId)));
}
/**
* @param {string|number} colId
* @returns {string[]|TypedArray} column
*/
col(colId) { return this.cols[this.colIdx(colId)]; }
/**
* @param {?number} [n]
* @returns {DataFrame} data frame
*/
head(n = null) {
if (n === null) {
return this.tail(opts.HEAD_LEN);
}
return this.slice(0, n);
}
/**
* @param {?number} [n]
* @returns {DataFrame} data frame
*/
tail(n = null) {
if (n === null) {
return this.tail(opts.HEAD_LEN);
}
return this.slice(this.length - n, this.length);
}
/**
* @returns {number} number of rows
*/
get length() {
if (this.cols[0] === undefined) {
return 0;
} else {
return this.cols[0].length;
}
}
/**
* @returns {number} number of columns
*/
get nCols() { return this.cols.length; }
/**
* @param {...(string|number)} colIds
* @returns {DataFrame} data frame
*/
dtype(...colIds) {
if (colIds.length === 0) {
return this.dtype(...this.colNames);
}
const colIdxs = colIds.map((id) => this.colIdx(id));
return this.select((cIdx) => colIdxs.indexOf(cIdx) >= 0).agg((col) => col.dtype).rename(1, 'dtype');
}
/**
* @returns {ColStr} data types for all columns
*/
get dtypes() { return Column.from(this.cols.map((c) => c.dtype), 's'); }
/**
* @param {...(number|string|RegExp|function((string|number)): boolean)} params
* @returns {DataFrame} data frame
*/
select(...params) {
if (params.length === 0) {
throw new Error('the DF is empty because no column IDs provided, try: df.select(0, -2, 3)');
} else if (params.length === 1 && isRegExp(params[0])) {
const regex = params[0];
return this.select((cName) => isString(cName) && cName.search(regex) >= 0);
} else if (params.length === 1 && isFunction(params[0])) {
const f = params[0];
return this.select(...(this.colNames.filter((name) => f(name) || f(this.colIdx(name)))));
} else {
const cols = [];
const colNames = [];
const dtypes = [];
for (const i of new Set(params.map((id) => this.colIdx(id)))) {
cols.push(this.cols[i]);
colNames.push(this.colNames[i]);
dtypes.push(this.dtypes[i]);
}
return new DataFrame(cols, 'cols', colNames, dtypes);
}
}
/**
* @param {...(number|string|RegExp|function((string|number)): boolean)} params
* @returns {DataFrame} data frame
*/
selectRows(...params) {
if (params.length === 0) {
throw new Error('the DF is empty because no row IDs provided, try: df.selectRows(0, -2, 3)');
} else if (params.length === 2 && isRegExp(params[0])) {
return this.selectRows((val) => val.search(params[0]) >= 0, params[1]);
} else if (params.length === 2 && isFunction(params[0])) {
const colId = params[1];
const f = params[0];
// focus on one column
const col = this.col(colId);
const tests = Array(col.length).fill(false).map((_, rIdx) => f(col[rIdx], rIdx));
return new DataFrame(
this.cols.map((c) => c.filter((_, rIdx) => tests[rIdx])),
'cols',
[...this.colNames],
[...this.dtypes],
);
} else if (params.length === 1 && isFunction(params[0])) {
throw new Error(`${this.constructor.name}.selectRows() not implemented yet`);
} else if (params.reduce((ok, focus) => ok && isNumber(focus) && Number.isInteger(focus))) {
throw new Error(`${this.constructor.name}.selectRows() not implemented yet`);
} else {
throw new Error(`${this.constructor.name}.selectRows() didn't understand args ${params.map((x) => x.toString()).join(', ')}`);
}
}
/**
* @param {...(string|number)} params pairs of colId, newName
* @returns {DataFrame} data frame with renamed col
*/
rename(...params) {
const msg = (m) => `${this.constructor.name}.rename() ${m}`;
if (params.length === 1 && Array.isArray(params[0])) {
const pairs = params[0].map((newName, cIdx) => [cIdx, newName]);
const args = pairs.reduce((pair1, pair2) => pair1.concat(pair2), []);
return this.rename(...args);
} else if (params.length === 1 && this.nCols === 1) {
log.info(msg('colId not specified for rename'));
return this.rename(0, params[0]);
} else if (params.length % 2 !== 0) {
throw new Error(msg('you need to provide pairs of colId, newName (e.g. df.rename(1, "Width", -2, "Length"))'));
}
const colNames = [...this.colNames];
for (let i = 1; i < params.length; i += 2) {
const colId = params[i - 1];
const newName = params[i];
const colIdx = this.colIdx(colId);
colNames[colIdx] = newName;
}
return new DataFrame([...this.cols], 'cols', colNames, [...this.dtypes]);
}
/**
* @param {?number|string} [colId]
* @param {string|function(Col): Col} f
* @param {...*} args
* @returns {DataFrame} data frame with f applied to colId
*/
call(colId = null, f, ...args) {
if (isString(f)) {
return this.call(colId, (col) => col[f](...args));
}
if (colId === null) {
log.info(`${this.constructor.name}.call() colId not specified, running for all cols`);
return [...this.colNames].reduce((df, cName) => df.call(cName, f, ...args), this);
}
const cols = [...this.cols];
const cIdx = this.colIdx(colId);
cols[cIdx] = f(cols[cIdx], ...args);
return new DataFrame(cols, 'cols', [...this.colNames], this.dtypes.map((t, idx) => idx === cIdx ? null : t));
}
/**
* @param {number} idx
* @returns {string}
* @private
*/
_colName(idx) {
return `col #${idx}${this.colNames[idx] === idx ? '' : ` (${this.colNames[idx]})`} dtype ${this.dtypes[idx]}`;
}
/**
* @param {function(ColNum): number} [f]
* @param {...*} args
* @returns {DataFrame} data frame
*/
agg(f = (xs) => xs.length, ...args) {
if (isString(f)) {
return this.agg((col) => col[f](...args)).rename(-1, f);
}
const colNames = [];
const aggResults = [];
const { nCols } = this;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
const col = this.cols[cIdx];
const colName = this.colNames[cIdx];
colNames.push(colName);
aggResults.push(f(col, ...args));
}
return new DataFrame(
[colNames, aggResults.map((x) => x.toString())],
'cols',
['column', 'agg'],
['s', null],
);
}
/**
* @param {number|string} colId
* @param {function(ColNum): number} [f]
* @param {...*} args
* @returns {DataFrame} data frame
*/
groupBy(colId, f = (xs) => xs.length, ...args) {
if (isString(f)) {
return this.groupBy(colId, (xs) => xs[f](...args)).rename(-1, f);
}
const cIdx = this.colIdx(colId);
const index = new Map();
for (const r of this) {
const v = r[cIdx];
const maybe = index.get(v);
if (maybe === undefined) {
index.set(v, [r]);
} else {
maybe.push(r);
}
}
for (const k of index.keys()) {
index.set(k, f(index.get(k)));
}
return new DataFrame(index, 'map', [this.colNames[cIdx], 'result'], [this.dtypes[cIdx], null]);
}
/**
* @param {string|function(Column, Column, Number): Column} f
* @param {?DataFrame} [other]
* @returns {DataFrame} data frame
*/
connect(f, other = null) {
if (other === null) {
return this.connect(f, this);
}
if (isString(f)) {
return this.connect((xs, ys, idx) => xs[f](ys, idx), other);
}
const cols = Array(this.cols.length).fill(null);
const colNames = Array(this.cols.length).fill(null);
const { nCols } = this;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
cols[cIdx] = f(this.cols[cIdx], other.cols[cIdx], cIdx);
if (!isNumber(this.colNames[cIdx])) {
colNames[cIdx] = this.colNames[cIdx];
} else if (!isNumber(other.colNames[cIdx])) {
colNames[cIdx] = other.colNames[cIdx];
} else {
colNames[cIdx] = null;
}
}
return new DataFrame(cols, 'cols', colNames);
}
/**
* @param {function(Col, Col): (number|string)} f
* @param {?boolean} [withNames]
* @param {?boolean} [isCommutative]
* @param {*} [identity]
* @returns {DataFrame} data frame
*/
matrix(f, withNames = true, isCommutative = false, identity = null, ...args) {
if (isString(f)) {
return this.matrix((xs, ys) => xs[f](ys, ...args), withNames, isCommutative, identity);
}
const colIdxs = [];
const rows = [];
const cache = {};
const { nCols } = this;
const msg = (m) => `${this.constructor.name}.matrix() ${m}`;
const debug = (m) => log.debug(msg(m));
for (let yIdx = 0; yIdx < nCols; yIdx++) {
// else
colIdxs.push(yIdx);
rows.push([]);
for (let xIdx = 0; xIdx < nCols; xIdx++) {
// some ops have a fixed return value when applied to self f(xs, xs) == id
if (identity !== null && xIdx === yIdx) {
debug(`[skipping identity yIdx == xIdx, f(${this._colName(xIdx)}, ${this._colName(yIdx)}) = ${identity}`);
rows[rows.length - 1].push(identity);
continue;
}
const col = this.cols[yIdx];
const other = this.cols[xIdx];
let result;
// sometimes order does not matter: f(xs, ys) === f(ys, xs)
if (isCommutative) {
result = cache[`${yIdx}:${xIdx}`];
// try swap
if (result === undefined) {
result = cache[`${xIdx}:${yIdx}`];
}
// if fail, compute
if (result === undefined) {
result = f(col, other, ...args);
cache[`${yIdx}:${xIdx}`] = result;
debug(`computed and cached f(${this._colName(yIdx)}, ${this._colName(xIdx)})`);
} else {
debug(`CACHE HIT for f(${this._colName(yIdx)}, ${this._colName(xIdx)})`);
}
rows[rows.length - 1].push(result);
} else {
rows[rows.length - 1].push(f(col, other, ...args));
}
}
}
// numeric col names in the order of appearing in the matrix
const colNames = this.colNames.filter((_, cIdx) => colIdxs.indexOf(cIdx) >= 0);
const df = new DataFrame(rows, 'rows', colNames);
return withNames ? df.prependCol(this.colNames.clone(), 'column') : df;
}
/**
* @param {string[]|number[]|TypedArray} col
* @param {?String} [name]
* @param {?DType|'s'} [dtype]
* @returns {DataFrame} data frame
*/
appendCol(col, name = null, dtype = null) {
const cols = [...this.cols];
const colNames = [...this.colNames];
const dtypes = [...this.dtypes];
cols.push(col);
colNames.push(name === null ? colNames.length : name);
dtypes.push(dtype === null ? null : dtype);
return new DataFrame(
cols,
'cols',
colNames,
dtypes,
);
}
/**
* @param {string[]|number[]|TypedArray} col
* @param {?String} [name]
* @param {?DType|'s'} [dtype]
* @returns {DataFrame} data frame
*/
prependCol(col, name = null, dtype = null) {
const cols = [...this.cols];
const colNames = [...this.colNames];
const dtypes = [...this.dtypes];
cols.unshift(col);
colNames.unshift(name === null ? colNames.length : name);
dtypes.unshift(dtype === null ? null : dtype);
return new DataFrame(
cols,
'cols',
colNames,
dtypes,
);
}
/**
* @param {DataFrame} other
* @param {'col'|'row'|'cols'|'rows'|0|1} [axis]
* @returns {DataFrame} data frame
*/
concat(other, axis = 0) {
if (isNumber(axis)) {
if (axis < 0) {
return this.concat(other, axis + 2);
} else if (axis === 0) {
return this.connect('concat', other);
} else if (axis === 1) {
// else if concat HORIZONTALLY
const isDigit = /^\d+$/; // check if has proper column names or just indexes
// if columns are indexes, shift them
let colNames;
if (other.colNames.filter((c) => c.toString().search(isDigit)).length === other.colNames.length) {
colNames = this.colNames.concat(other.colNames.map((cIdx) => this.colNames.length + cIdx));
} else {
colNames = this.colNames.concat(other.colNames);
}
let renamed;
/*
* deal with duplicate col names (add a num to the -- e.g.: Age, Salary, Age2 ...)
* make sure that name clash didn't arise as a result of previous renaming
*/
do {
renamed = false; // clear
for (let cIdx = 0; cIdx < colNames.length; cIdx++) {
const name = colNames[cIdx];
let count = 2;
for (let ptr = cIdx + 1; ptr < colNames.length; ptr++) {
const name2 = colNames[ptr];
if (name === name2) {
colNames[ptr] += count.toString();
renamed = true;
count++;
}
}
}
} while (renamed);
return new DataFrame(this.cols.concat(other.cols), 'cols', colNames);
}
} else if (isString(axis)) {
if (axis.search(/^col/i) >= 0) {
return this.concat(other, 1);
} else if (axis.search(/^row/i) >= 0) {
return this.concat(other, 0);
}
}
throw new Error(`${this.constructor.name}.concat() invalid axis argument ${axis}, try 0, 1, 'rows' or 'cols'`);
}
/**
* @param {...number} idxs PAIRS of indexes
* @returns {DataFrame} a data frame
*/
slice(...idxs) {
if (idxs.length === 0) {
throw new Error('you need to specify indexes (e.g. df.slice(0, 10), df.slice(-20, -10))');
} else if (idxs.length % 2 !== 0) {
idxs.push(this.length); // odd number of idxs
/*
* e.g. slice(0) -> slice(0, end)
* e.g. slice(0, 10, 20) -> slice(0, 10, 20, end)
*/
} else if (idxs.some((idx) => idx < 0)) {
// resolve negative indexes
return this.slice(...(idxs.map((idx) => (idx < 0 ? idx + this.length : idx))));
}
const { nCols } = this;
const cols = Array(nCols).fill(0);
// for every pair of indexes
for (let i = 1; i < idxs.length; i += 2) {
const lBound = idxs[i - 1];
const rBound = idxs[i];
for (let cIdx = 0; cIdx < nCols; cIdx++) {
const col = this.cols[cIdx];
cols[cIdx] = col.subarray(lBound, rBound);
}
}
return new DataFrame(cols, 'cols', [...this.colNames]);
}
/**
* E.g. sliceCols(0) -> sliceCols(0, end).
* E.g. sliceCols(0, 10, 20) -> sliceCols(0, 10, 20, end).
*
* @param {...(number|string)} slices
* @returns {DataFrame} data frame
*/
sliceCols(...slices) {
const msg = (m) => `${this.constructor.name}.sliceCols() ${m}`;
if (slices.length === 0) {
throw new Error(msg('no slice idxs specified (e.g. df.sliceCols(0, -1))'));
} else if (slices.length % 2 !== 0) {
// odd number of idxs
return this.sliceCols(...slices, this.nCols - 1);
}
// collect column idxs
const colIds = new Set();
const { nCols } = this;
for (let i = 1; i < slices.length; i += 2) {
const lBound = this.colIdx(slices[i - 1]);
let rBound;
// is num out of bounds
if (isNumber(slices[i]) && slices[i] >= nCols) {
log.warn(msg(`upper bound > #cols, you wanted cols up to ${slices[i]}th but there are ${nCols}`));
rBound = nCols - 1;
} else {
rBound = this.colIdx(slices[i]);
}
for (let cIdx = lBound; cIdx <= rBound; cIdx++) {
colIds.add(cIdx);
}
}
// then select them
return this.select(...colIds);
}
/**
* @param {...(number|string|RegExp|function((string|number)): boolean)} params
* @returns {DataFrame} data frame
*/
drop(...params) {
if (params.length === 0) {
throw new Error('you need to select a column (e.g. df.drop(0, -2, -4))');
}
const { colNames } = this.select(...params);
return this.select((cName) => isString(cName) && colNames.indexOf(cName) < 0);
}
/**
* @param {function((string|number), number): boolean} f
* @param {?string|number} [colId]
* @returns {DataFrame} data frame
*/
filter(f, colId = null) {
if (colId === null) {
log.warn(`${this.constructor.name}.filter() no columns specified so running for all`);
return [...this.colNames].reduce((df, cName) => df.filter(f, cName), this);
}
// else focus on one column
const col = this.col(colId);
const tests = Array(col.length).fill(false).map((_, rIdx) => f(col[rIdx], rIdx));
return new DataFrame(
this.cols.map((c) => c.filter((_, rIdx) => tests[rIdx])),
'cols',
[...this.colNames],
[...this.dtypes],
);
}
/**
* @param {string|number} val
* @param {?string|number} colId
* @param {"not is"|"is"|"lt"|"gt"|"gte"|"lte"|">"|">="|"<"|"<="|"="|"!="|"!=="|"=="|"less than"|"greater than"|"not equal"|"ne"|"equal"|"equals"|"less than or equal"|"greater than or equal"} op
* @returns {DataFrame} data frame
*/
where(val, colId = null, op = '=') {
switch (op.toLowerCase()) {
case '!=': return this.filter((x) => x !== val, colId);
case '!==': return this.where(val, colId, '!=');
case 'ne': return this.where(val, colId, '!=');
case 'not equal': return this.where(val, colId, '!=');
case 'not equals': return this.where(val, colId, '!=');
case 'is': return this.filter((x) => Object.is(x, val), colId);
case 'not is': return this.filter((x) => !Object.is(x, val), colId);
case '!is': return this.where(val, colId, 'not is');
case 'is not': return this.where(val, colId, 'not is');
case '=': return this.filter((x) => x === val, colId);
case '==': return this.where(val, colId, '=');
case 'equal': return this.where(val, colId, '=');
case 'equals': return this.where(val, colId, '=');
case '>': return this.filter((x) => x > val, colId);
case 'gt': return this.where(val, colId, '>');
case 'greater than': return this.where(val, colId, '>');
case '<': return this.filter((x) => x < val, colId);
case 'lt': return this.where(val, colId, '<');
case 'less than': return this.where(val, colId, '<');
case '<=': return this.filter((x) => x <= val, colId);
case 'lte': return this.where(val, colId, '<=');
case 'less than or equal': return this.where(val, colId, '<=');
case '>=': return this.filter((x) => x >= val, colId);
case 'gte': return this.where(val, colId, '>=');
case 'greater than or equal': return this.where(val, colId, '>=');
default: throw new Error(`unrecognised op ${op}`);
}
}
/**
* @param {*} val
* @param {...*} colIds
* @returns {DataFrame} data frame without val in colIds
*/
removeAll(val, ...colIds) {
if (colIds.length === 0) {
if (this.nCols === 0) {
throw new Error(`empty ${this.constructor.name}, no column to perform removeAll on`);
}
return this.removeAll(val, ...this.colNames);
}
return [...colIds].reduce((df, cName) => df.where(val, cName, 'not is'), this);
}
/**
* @param {?number|?String} [colId]
* @param {RegExp|string|number} pat
* @param {string|number} repl
* @param {...*} args
* @returns {DataFrame} data frame with f applied to colId
*/
replace(colId = null, pat, repl, ...args) {
return this[isNumber(pat) ? 'numeric' : 'nominal'].call(colId, 'replace', pat, repl, ...args);
}
/**
* @param {...(number|string)} colIds
* @returns {DataFrame} data frame
*/
removeOutliers(...colIds) {
// by default compute for all (numeric) columns
if (colIds.length === 0) {
log.info(`${this.constructor.name}.removeOutliers() running for all cols`);
return this.removeOutliers(...this.colNames);
}
const cols = [...this.cols];
const numCols = this.numColIdxs;
// indexes of *NUMERIC* columns
const numColIdxs = new Set(colIds.map((id) => this.colIdx(id)).filter((cIdx) => numCols.has(cIdx)));
// store {Q1, Q3, idx} for every *NUMERIC* column
const IQRs = this.colNames
// get column indexes
.map((_, idx) => idx)
// and now get all NUMERIC columns while leaving gaps to preserve indexing
.map((idx) => (numColIdxs.has(idx) ? this.cols[idx] : null))
// and now computer IQ1 and IQ3 for all NUMERIC columns while leaving gaps to preserve indexing
.map((maybeCol) => (maybeCol === null ? null : ({ Q1: maybeCol.Q1(), Q3: maybeCol.Q3() })));
// store results of testing for all rows
const tests = Array(this.length).fill(true);
const { nCols } = this;
// see if this row is an outlier by looking at each numeric column
for (let rIdx = 0; rIdx < this.length; rIdx++) {
for (let cIdx = 0; cIdx < nCols; cIdx++) {
if (numColIdxs.has(cIdx)) {
const col = cols[cIdx];
const val = col[rIdx];
// if value is in Q1 .. Q3 then accept
if (val < IQRs[cIdx].Q1 || val > IQRs[cIdx].Q3) {
tests[rIdx] = false;
break;
}
}
}
}
for (let cIdx = 0; cIdx < nCols; cIdx++) {
// filter every col according to pre-computed boolean vals above
cols[cIdx] = cols[cIdx].filter((_, rIdx) => tests[rIdx]);
}
return new DataFrame(cols, 'cols', [...this.colNames], [...this.dtypes]);
}
/**
* @param {?number|string} [colId]
* @param {'asc'|'des'|function(string, string): number|function(number, number): number} [ord]
* @returns {DataFrame}
*/
sort(colId = null, ord = 'asc') {
if (colId === null) {
if (this.nCols === 1) {
log.warn(`colId not provided but there is only 1 col so sorting on ${this._colName(0)}`);
return this.sort(0, ord);
} else {
throw new Error('you need to select a column (e.g. df.sort(0))');
}
}
if (isFunction(ord)) {
return new DataFrame([...this].sort(ord), 'rows', [...this.colNames], [...this.dtypes]);
}
const cIdx = this.colIdx(colId);
return this.sort(cIdx, ord.search(/^asc/i) >= 0
// eslint-disable-next-line no-nested-ternary
? (r1, r2) => r1[cIdx] > r2[cIdx] ? 1 : r1[cIdx] < r2[cIdx] ? -1 : 0
// eslint-disable-next-line no-nested-ternary
: (r1, r2) => r1[cIdx] > r2[cIdx] ? -1 : r1[cIdx] < r2[cIdx] ? 1 : 0);
}
/**
* Shuffle the data frame.
*
* @returns {DataFrame} data frame with shuffle rows
*/
shuffle() {
const rIdxs = Array(this.length).fill(0).map((_, idx) => idx);
const rows = [];
const rs = this.rows;
for (let i = 0; i < this.length; i++) {
const idx = Math.floor(Math.random() * rIdxs.length);
const rIdx = rIdxs[idx];
rows.push(rs[rIdx]);
rIdxs.splice(idx, 1);
}
return new DataFrame(rows, 'rows', [...this.colNames], [...this.dtypes]);
}
/**
* @param {?number} [n] number of cols to select
* @param {string|!function(Column): number} [agg]
* @param {...*} args
* @returns {DataFrame} data frame
*/
nBest(n = null, agg = 'var', ...args) {
if (n === null) {
return this.nBest(opts.NBEST, agg);
}
if (n > this.nCols) {
log.warn(`${this.constructor.name}.nBest(n = ${n}), but there is ${this.nCols} cols`);
return this.nBest(this.nCols, agg);
}
return this.agg(agg, ...args)
.sort(-1, 'des')
.slice(0, n);
}
/**
* @param {?number} n ratio or number of elements
* @param {?boolean} wr with replacement
* @returns {DataFrame} data frame
*/
sample(n = null, wr = true) {
if (n === null) {
return this.sample(opts.SAMPLE_SIZE, wr);
} else if (n < 1) {
// tODO optimize DF.sample(n, wr)
return this.sample(Math.floor(n * this.length));
} else if (n >= this.length) {
log.warn(`${this.constructor.name}.sample() sample size ${n} >= nRows ${this.length}`);
return this.sample(this.length - 1, wr);
}
const rows = [];
const rs = this.rows;
if (wr) {
while (rows.length < n) {
const rIdx = randInt(0, this.length);
rows.push(rs[rIdx]);
}
} else {
const idxs = Array(this.length).fill(0).map((_, idx) => idx);
while (rows.length < n) {
// this is a bit confusing because you are indexing an index
const i = randInt(0, idxs.length);
const rowIdx = idxs[i];
const row = rs[rowIdx];
rows.push(row);
idxs.splice(i, 1); // remove i from possible idxs
}
}
return new DataFrame(rows, 'rows', [...this.colNames]);
}
/**
* Produce a count table for values of a column.
*
* @param {string|number} colId
* @returns {DataFrame} data frame of counts
*/
counts(colId) { return this.groupBy(colId, (xs) => xs.length); }
/**
* Produce a ps table for values of a column.
*
* @param {string|number} colId
* @returns {DataFrame} data frame of pss
*/
ps(colId) { return this.counts(colId).map(-1, (x) => x / this.length); }
/**
* One hot encode a column.
*
* @param {string|number} colId
* @returns {DataFrame} one hot encoded table
*/
oneHot(colId = null) {
if (colId === null) {
if (this.nCols === 1) {
log.info(`${this.constructor.name}.oneHot() colId not specified for counts, executing on the only col`);
return this.oneHot(0);
} else {
throw new Error('you need to select a column (e.g. `df.oneHot(0)`)');
}
}
const col = this.col(colId);
const k = col.max() + 1;
const cols = Array(k)
.fill(0)
.map(() => Column.empty(col.length, 'u8'));
for (let rowIdx = 0; rowIdx < col.length; rowIdx++) {
const val = col[rowIdx];
cols[val][rowIdx] = 1;
}
return new DataFrame(cols, 'cols');
}
/**
* @param {number} [p]
* @param {?boolean} [withNames]
* @returns {DataFrame} data frame
*/
distance(p = 2, withNames = true) {
return this.numeric.matrix('distance', withNames, true, 0, p);
}
/**
* @param {?boolean} [withNames]
* @returns {DataFrame} data frame
*/
dot(withNames = true) {
return this.numeric.matrix('dot', withNames, true, null);
}
/**
* @param {?boolean} [withNames]
* @returns {DataFrame} data frame
*/
cov(withNames = true) {
return this.numeric.matrix('cov', withNames, true, null);
}
/**
* @param {?boolean} [withNames]
* @returns {DataFrame} data frame
*/
corr(withNames = true) {
return this.numeric.matrix('corr', withNames, true, 1);
}
/**
* Summaries each column.
*
* @returns {DataFrame} data frame
*/
summary() {
if (this.numColIdxs.size !== this.cols.length) {
log.info('omitting str columns from summary');
return this.numeric.summary();
}
const { dtypes, nCols } = this;
const info = {
column: Column.empty(nCols, 's'),
dtype: Column.empty(nCols, 's'),
min: Column.empty(nCols),
max: Column.empty(nCols),
range: Column.empty(nCols),
mean: Column.empty(nCols, 'f64'),
stdev: Column.empty(nCols, 'f64'),
};
for (let cIdx = 0; cIdx < nCols; cIdx++) {
info.column[cIdx] = this.colNames[cIdx];
info.dtype[cIdx] = dtypes[cIdx];
const col = this.cols[cIdx];
info.min[cIdx] = col.min();
info.max[cIdx] = col.max();
info.range[cIdx] = info.max[cIdx] - info.min[cIdx];
info.mean[cIdx] = col.mean();
info.stdev[cIdx] = col.stdev();
}
return new DataFrame(info);
}
/**
* @returns {DataFrame} shallow copy of the data frame
*/
copy() {
return new DataFrame([...this.cols], 'cols', [...this.colNames], [...this.dtypes]);
}
/**
* @returns {DataFrame} clone (deep copy) of the data frame
*/
clone() {
const { nCols } = this;
const newCols = Array(nCols).fill(0);
for (let cIdx = 0; cIdx < nCols; cIdx++) {
const col = this.cols[cIdx];
newCols[cIdx] = col.clone();
}
return new DataFrame(newCols, 'cols', [...this.colNames], [...this.dtypes]);
}
/**
* @param {?string} [fileName]
* @returns {Object<number[]|string[]>|undefined} dictionary
*/
toObj(fileName = null) {
if (fileName !== null) {
log.info(`writing Object to "${fileName}"`);
writeFileSync(fileName, JSON.stringify(this.toObj()), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
const dict = {};
const { nCols } = this;
for (let cIdx = 0; cIdx < nCols; cIdx++) {
const cName = this.colNames[cIdx];
const col = this.cols[cIdx];
dict[cName] = [...col];
}
return dict;
}
/**
* @param {?string} fileName
* @returns {string|undefined}
*/
toJSON(fileName = null) {
if (fileName !== null) {
log.info(`writing JSON to "${fileName}"`);
writeFileSync(fileName, this.toJSON(), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
return JSON.stringify(this.toObj());
}
/**
* @param {?string} [fileName]
* @returns {string|undefined}
*/
toCSV(fileName = null) {
if (fileName !== null) {
log.info(`writing CSV to "${fileName}"`);
writeFileSync(fileName, this.toCSV(), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
const stringify = require('csv-stringify/lib/sync');
const a = [...this];
a.unshift(this.colNames);
return stringify(a);
}
/**
* @param {string} tableName
* @param {?string} [fileName]
* @returns {string}
*/
toSQLTableDef(tableName = 'Table', fileName = null) {
if (fileName !== null) {
log.info(`writing SQL TABLE DEFINITION to ${fileName}`);
writeFileSync(fileName, this.toSQLTableDef(tableName), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
const dtypeMap = {
f32: 'REAL',
f64: 'REAL',
i8: 'INT',
i16: 'INT',
i32: 'INT',
u8: 'INT',
u16: 'INT',
u32: 'INT',
s: 'TEXT',
};
return `
CREATE TABLE IF NOT EXISTS ${tableName} (
${this.colNames.some((col) => col.toLowerCase() === 'id') ? '' : `id int NOT NULL AUTO_INCREMENT,`}
${this.colNames.map((cName, cIdx) => `${cName} ${dtypeMap[this.dtypes[cIdx]]}`).join(',\n ')}
);`.trim();
}
/**
* @param {string} [tableName]
* @param {?string} [fileName]
* @returns {string}
*/
toSQLInserts(tableName = 'Table', fileName = null) {
if (fileName !== null) {
log.info(`writing SQL INSERTs to ${fileName}`);
writeFileSync(fileName, this.toSQLInserts(tableName), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
const { rows } = this;
const xs = Array(this.length);
for (let i = 0; i < this.length; i++) {
xs[i] = rows[i];
}
return xs.map((row) => `INSERT INTO ${tableName} (${this.colNames.join(', ')}) VALUES (${row.map((val) => val.toString()).join(', ')});`).join('\n');
}
/**
* @param {string} [tableName]
* @param {?string} [fileName]
* @returns {string|undefined}
*/
toSQLUpdates(tableName = 'Table', fileName = null) {
if (fileName !== null) {
log.info(`writing SQL UPDATEs to ${fileName}`);
writeFileSync(fileName, this.toSQLUpdates(tableName), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
const { rows } = this;
const xs = Array(this.length);
for (let i = 0; i < this.length; i++) {
xs[i] = rows[i];
}
return xs.map((row) => `UPDATE ${tableName} SET ${row.map((val, cIdx) => `${this.colNames[cIdx]} = ${val.toString()}`).join(', ')};`).join('\n');
}
/**
* @param {?string} [fileName]
* @returns {string|undefined} HTML
*/
toHTML(fileName = null) {
if (fileName !== null) {
log.info(`writing HTML to ${fileName}`);
writeFileSync(fileName, this.toHTML(), { encoding: 'utf-8', flag: 'w' });
return undefined;
}
const chunks = [];
chunks.push('<table>');
chunks.push('<tr>');
for (const name of this.colNames) {
chunks.push('<th>');
chunks.push(name.toString());
chunks.push('</th>');
}
chunks.push('</tr>');
for (let rIdx = 0; rIdx < this.length; rIdx++) {
chunks.push('<tr>');
const { rows } = this;
for (const val of rows[rIdx]) {
chunks.push(`<td>${val.toString()}</td>`);
}
chunks.push('</tr>');
}
chunks.push('</table>');
return chunks.join('');
}
/**
* @returns {string[]} datasets
*/
static get dataSets() {
const filters = [(path) => path.search(/\.(csv|json)$/i) >= 0];
const nodeStack = [...opts.DATASETS];
const datasets = new Set();
let i = 0;
while (i < nodeStack.length) {
const path = resolve(nodeStack[i]);
if (path.search(/\.[._]\w+$|node_modules$/) >= 0) {
i++;
continue;
}
const stats = statSync(path);
if (stats.isDirectory()) {
for (const f of readdirSync(path)) {
nodeStack.push(join(path, f));
}
} else if (stats.isFile()) {
if (filters.reduce((ok, f) => ok || f(path), false)) {
datasets.add(path);
}
}
i++;
}
return [...datasets];
}
/**
* @returns {Record<string,*>}
*/
static get opts() { return opts; }
/**
* @param {?number} [n]
* @param {?number} [m]
* @param {function(string): void} sink
*/
print(n = null, m = null, sink = console.log) { sink(this.toString(n, m)); }
/**
* @param {string} opt
*/
static setPrinting(opt = 'minimal') {
if (opt.search(/^mini/i) >= 0) {
log.warn('minimal printing ON');
Object.assign(opts, MINIMAL_PRINTING);
} else if (opt.search(/^def/) >= 0) {
log.warn('default printing ON');
Object.assign(opts, DEFAULT_PRINTING);
} else if (opt.search(/^prog/) >= 0) {
log.warn('programmer printing ON');
Object.assign(opts, PROGRAMMER_PRINTING);
} else throw new Error(`unrecognised printing opt ${opt}, try "minimal", "programmer" or "default"`);
}
/**
* @param {number} [n]
* @param {number} [m]
* @returns {string} string representation of the data frame
*/
toString(n = null, m = null) {
// in node debugger rows and columns are null
const termHeight = process.stdout.rows || 12;
const termWidth = process.stdout.columns || 50;
if (n === null) {
const newN = Math.min(this.length, termHeight - 7);
return this.toString(newN);
} else if (m === null) {
return this.toString(0, n);
} else if (n < 0) {
return this.toString(n + this.length, m);
} else if (m < 0) {
return this.toString(n, m + this.length);
} else if (n > this.length) {
log.warn(`${this.constructor.name}.toString(n = ${n}), but there is ${this.length} rows`);
return this.toString(this.length - n, this.length);
} else if (m > this.length) {
log.warn(`${this.constructor.name}.toString(m = ${m}), but there is ${this.length} rows`);
return this.toString(Math.max(0, this.length - (m - n)), this.length);
} else if (this.nCols === 0) {
return opts.EMPTY_STR;
}
const { dtypes } = this;
// always has the actual number of columns INCLUDING the optional index
const nCols = this.nCols + (opts.SHOW_INDEX ? 1 : 0);
let colWidth = opts.MIN_COL_WIDTH;
// e.g. for 3 cols you have 2 * SPACE_BETWEEN so always subtract a single SPACE_BETWEEN
const widthTaken = (this.nCols * (colWidth + opts.SPACE_BETWEEN)) - opts.SPACE_BETWEEN;
if (widthTaken < termWidth) {
const perCol = (termWidth - widthTaken) / nCols;
colWidth += Math.floor(perCol);
}
// this will be concatenated and displayed
const rows = [];
// index marker
const headerRow = opts.SHOW_INDEX ? [opts.IDX_MARKER] : [];
for (let cIdx = 0; cIdx < this.nCols; cIdx++) {
// col name could be an int so convert
let h = this.colNames[cIdx].toString();
// trunc column headings (if needed)
if (h.length > colWidth) {
h = h.slice(0, colWidth - opts.DOTS.length) + opts.DOTS;
}
if (opts.PRINT_TYPES) {
// pad to reserve space for opts.PAD_STR and dtype (injected later after col len is computed)
h = `${opts.PAD_STR.repeat(dtypes[cIdx].length)} ${h.toString()}`;
} else {
// no padding needed
h = h.toString();
}
headerRow.push(h);
}
rows.push(headerRow);
/*
* info about not displayed rows
* .. ... (2 more) .. .. <- THIS
* 3
* 4
* 5
*/
if (opts.SHOW_MORE && n > 0 && this.nCols > 0) {
const arr = Array(nCols).fill(opts.DOTS);
arr[0] = `(${n} more)`;
rows.push(arr);
}
// memoize
const numCols = this.numColIdxs;
/*
* now the actual content of the table
* remember about optional index (inject AFTER)!
*/
for (let rIdx = n; rIdx < m; rIdx++) {
// initialise
const row = Array(this.nCols).fill('');
const rs = this.rows;
for (let cIdx = 0; cIdx < this.nCols; cIdx++) {
const val = rs[rIdx][cIdx];
const s = val.toString();
const isNum = numCols.has(cIdx);
const isFloat = isNum && dtypes[cIdx].startsWith('f');
if (isFloat) {
row[cIdx] = fmtFloat(val, opts.PRINT_PREC);
continue;
}
const isStr = !isNum && isString(val);
const isTooLong = isStr && s.length > colWidth;
if (isTooLong) {
row[cIdx] = s.slice(0, colWidth - opts.DOTS.length) + opts.DOTS;
continue;
}
row[cIdx] = s;
}
if (opts.SHOW_INDEX) {
row.unshift(rIdx.toString(opts.INDEX_BASE));
}
rows.push(row);
}
if (this.length === 0) {
const emptyInfo = Array(this.nCols).fill(opts.EMPTY);
if (opts.SHOW_INDEX) {
emptyInfo.unshift('');
}
rows.push(emptyInfo);
} else if (opts.SHOW_MORE && m < this.length && this.nCols > 0) {
const arr = Array(nCols).fill(opts.DOTS);
arr[0] = `(${this.length - m} more)`;
rows.push(arr);
}
if (opts.MEM_INFO) {
// assume str
const memInfo = Array(this.nCols).fill(opts.MEM_INFO_STR);
for (let cIdx = 0; cIdx < this.nCols; cIdx++) {
const col = this.cols[cIdx];
// num column
if (col.memory !== undefined) {
memInfo[cIdx] = fmtFloatSI(col.memory(), 1, 'B');
}
}
/*
* different memory indicator for index
*(doesn't actually take any memory since it's just printed)
*/
if (opts.SHOW_INDEX) {
memInfo.unshift(opts.MEM_INFO_INDEX);
}
rows.push(memInfo);
}
// lengths of each column
const colWidths = Array(nCols)
.fill(0)
.map((_, cIdx) => rows
.map((r) => r[cIdx].toString().length)
.reduce((len1, len2) => Math.max(len1, len2), 1));
// inject underline AFTER col headings
rows.splice(1, 0, colWidths.map((l) => opts.UNDERLINE.repeat(l)));
// same for bottom but optional, the row before memory info (IF PRESENT)
if (opts.UNDERLINE_BOT) {
if (opts.MEM_INFO) {
rows.splice(rows.length - 1, 0, colWidths.map((l) => opts.UNDERLINE.repeat(l)));
} else {
rows.splice(rows.length, 0, colWidths.map((l) => opts.UNDERLINE.repeat(l)));
}
}
if (opts.PRINT_TYPES) {
if (opts.SHOW_INDEX) {
// inject dtypes for all headings
for (let cIdx = 1; cIdx < nCols; cIdx++) {
const len = colWidths[cIdx];
const dtype = dtypes[cIdx - 1];
const h = headerRow[cIdx].trim();
headerRow[cIdx] = dtype + opts.PAD_STR.repeat(len - h.length - dtype.length) + h;
}
// replace headerRow (without type labels) with typed
rows[0] = headerRow;
} else {
// inject dtypes for all headings
for (let cIdx = 0; cIdx < this.nCols; cIdx++) {
const len = colWidths[cIdx];
const dtype = dtypes[cIdx];
const h = headerRow[cIdx].trim();
headerRow[cIdx] = dtype + opts.PAD_STR.repeat(len - h.length - dtype.length) + h;
}
// replace headerRow (without type labels) with typed
rows[0] = headerRow;
}
}
// +1 for space between cols
const tooLong = colWidths.reduce((l1, l2) => l1 + l2 + opts.SPACE_BETWEEN, -opts.SPACE_BETWEEN) > termWidth;
if (tooLong) {
/*
* remove cols in the middle
*this ensures that there is n - 1 SPACE_BETWEEN for every n cols
*/
const nColsToShow = Math.floor((termWidth + opts.SPACE_BETWEEN) / (colWidth + opts.SPACE_BETWEEN));
/*
* C C C LEFT C C C RIGHT C C C
* should remove Cs on the left of LEFT and on the right of RIGHT
*C C C ... C C C
*/
const offset = Math.floor(nColsToShow / 2);
for (let rIdx = 0; rIdx < rows.length; rIdx++) {
let s = ''; // initialise
for (let cIdx = 0; cIdx < offset; cIdx++) {
s += rows[rIdx][cIdx].padStart(colWidths[cIdx] + opts.SPACE_BETWEEN, opts.PAD_STR);
}
s += opts.PAD_STR.repeat(opts.SPACE_BETWEEN) + opts.DOTS;
for (let cIdx = nCols - offset; cIdx < nCols; cIdx++) {
s += rows[rIdx][cIdx].padStart(colWidths[cIdx] + opts.SPACE_BETWEEN, opts.PAD_STR);
}
rows[rIdx] = s;
}
} else {
// display all (it fits on the screen)
for (let rIdx = 0; rIdx < rows.length; rIdx++) {
rows[rIdx] = rows[rIdx].map((val, cIdx) => val.padStart(colWidths[cIdx], opts.PAD_STR))
.join(opts.PAD_STR.repeat(opts.SPACE_BETWEEN));
}
}
return rows.join('\n');
}
[util.inspect.custom](depth, options) { return this.toString(); }
}
module.exports = DataFrame;