API Docs for: 0.1.0
Show:

File: kmer.js

'use strict';

const Type      = require('type-of-is');
const through2  = require('through2');
const BigNumber = require('bignumber.js');

/**
 * Provides the Kmer class
 * 
 * @module KmerJS
 * @main
 */


/**
 * Kmer.js is a module for calculating kmer frequencies and sequence probabilities
 *
 * A Kmer instance keeps a dense count profile with one slot per possible
 * k-length word over the alphabet (alphabet.length ^ k slots). Each word is
 * mapped to its slot by reading it as a base-N number, where N is the size of
 * the alphabet and each letter's digit is its position in the alphabet.
 *
 * @class Kmer
 * @module KmerJS
 * @constructor
 */
class Kmer {
    /**
     * The constructor method for instantiation
     *
     * @method constructor
     * @param {Number} k A value of k to generate the profile and calculate stats with
     * @param {String} letters A string of characters for the alphabet of the sequences. Defaults to "ACGT";
     * @throws {TypeError} If k is not a Number
     * @throws {TypeError} If k is not an integer (float, NaN or Infinity)
     * @throws {TypeError} If k is negative
     * @throws {TypeError} If letters is not a String
     * @throws {TypeError} If the number of letters is not > 1
     * @example
     *     Kmer = require('kmer.js')
     *     sampleA = new Kmer(9, "ACGT");
     */
    constructor(k, letters){
        // Any falsy value (undefined, null, "") falls back to the default alphabet
        const alphabet = letters || "ACGT";
        if (!Type.is(k, Number)) throw new TypeError("kmerJS takes an integer as its first positional argument");
        else if (!Number.isInteger(Number(k))) throw new TypeError("kmerJS takes an integer as its first positional argument. This was a float");
        else if (k < 0) throw new TypeError("kmerJS takes a non-negative integer as its first positional argument");
        else if (!Type.is(alphabet, String)) throw new TypeError("kmerJS expects the optional second positional argument to be a String");
        else if (alphabet.length <= 1) throw new TypeError("kmerJS expects the optional second positional argument to have a length > 1");

        //  Instance variables
        /**
         * Choice of k used to instantiate, used throughout the instance methods
         *
         * @property k
         * @type Number
         */
        this.k = k;
        /**
         * The 'alphabet' of sequence characters to use
         *
         * @property alphabet
         * @type String
         * @default "ACGT"
         */
        this.alphabet = alphabet;
        /**
         * A regular expression to match characters not belonging to the alphabet
         *
         * @property notInAlphabet
         * @type RegExp
         */
        // Escape the characters that are special inside a character class so that
        // alphabets such as "ACGT-N" are matched literally instead of as ranges.
        this.notInAlphabet = new RegExp("[^" + alphabet.replace(/[\\\]^-]/g, "\\$&") + "]");
        /**
         * A mapping of letters to their integer encodings (index within the alphabet)
         *
         * @property letterToBinary
         * @type Object
         */
        this.letterToBinary = alphabet.split('').reduce((acc, letter, i) => {
            acc[letter] = i;
            return acc;
        }, {});
        /**
         * A mapping of integer encodings back to their corresponding letters
         *
         * @property binaryToLetter
         * @type Array
         */
        this.binaryToLetter = alphabet.split('');
        /**
         * A profile of kmer counts
         *
         * @property profile
         * @type Uint32Array
         */
        this.profile = this.profileAsArray(k, alphabet);
        /**
         * A total of counts from an instantiated profile
         * It is NOT refreshed by update()/streamingUpdate();
         * recalculate after updating by running the TotalProfileCounts() method
         *
         * @property totalProfileCounts
         * @type BigNumber
         */
        this.totalProfileCounts = this.TotalProfileCounts();
        // Bound copy of Update so it can be handed around as a bare callback
        // (e.g. captured by streamingUpdate) without losing the instance.
        this.update = this.Update.bind(this);
    }


    /**
     * Returns a 32-bit int array of zeroes, given an alphabet and choice of k
     *
     * @method profileAsArray
     * @param {Number} k An integer value with which to generate substrings
     * @param {String} letters An optional string of letters, effectively the 'alphabet'. Defaults to 'ACGT'
     * @throws {TypeError} If k is not a Number
     * @throws {TypeError} If k is not an integer (float, NaN or Infinity)
     * @throws {TypeError} If k is negative
     * @throws {TypeError} If letters is not a String
     * @throws {TypeError} If the number of letters is not > 1
     * @return {Uint32Array} Returns a typed array of length  : letters.length ^ k
     */
    profileAsArray(k, letters){
        const alphabet = letters || "ACGT";
        if (!Type.is(k, Number)) throw new TypeError("kmer.profileAsArray takes a Number as its first positional argument");
        else if (!Number.isInteger(Number(k))) throw new TypeError("kmer.profileAsArray takes an integer as its first positional argument. This was a float");
        else if (k < 0) throw new TypeError("kmer.profileAsArray takes a non-negative integer as its first positional argument");
        else if (!Type.is(alphabet, String)) throw new TypeError("kmer.profileAsArray expects the optional second positional argument to be a String");
        else if (alphabet.length <= 1) throw new TypeError("kmer.profileAsArray expects the optional second positional argument to have a length > 1");

        // One slot for every possible word of length k over the alphabet
        return new Uint32Array(Math.pow(alphabet.length, k));
    }


    /**
     *  Returns an array of all k-length substrings. Takes a string and a k in that order.
     *
     * @method kmerArray
     * @param {String} s A string to slice into kmers
     * @param {Number} k An integer length for all resulting substrings
     * @throws {TypeError} If s is not a String
     * @throws {TypeError} If k is not a Number
     * @throws {TypeError} If k is not an integer (float, NaN or Infinity)
     * @throws {TypeError} If the length of s < k
     * @return {Array<String>} Returns an array of Strings, all of length k, and all substrings of s, in left-to-right order.
     *
     * @example
     *     >var k = 9
     *     >var testString = "AAACCCCCGCACCCGCGGGGGTTTCAGCGTGTCG"
     *     >var allKmersFromTestString = kmer.kmerArray(testString, k)
     */
    kmerArray(s, k){
        if (!Type.is(s, String)) throw new TypeError("kmer.kmerArray takes a String as its first positional argument");
        else if (!Type.is(k, Number)) throw new TypeError("kmer.kmerArray takes an integer as its second and final positional argument");
        else if (!Number.isInteger(Number(k))) throw new TypeError("kmer.kmerArray takes an integer as its second and final positional argument. This was a float");
        else if (s.length < k) throw new TypeError("kmer.kmerArray takes a String whose length > k as its first positional argument");

        const kmers = [];
        for (let start = 0; start < s.length; start++) {
            const word = s.substring(start, start + k);
            // Once a window runs off the end of s every later window does too
            if (word.length !== k) break;
            kmers.push(word);
        }
        return kmers;
    }


    /**
     * Updates the profile by pure side-effect. No return value.
     * Does not refresh .totalProfileCounts; call TotalProfileCounts() afterwards.
     *
     * @method update
     * @param {String} seq A String with letters matching the pre-specified alphabet
     * @param {Object} thisArg Unused; accepted only for backwards compatibility
     * @throws {TypeError} If seq is not a String
     * @throws {TypeError} If the number of characters in seq < k
     * @throws {TypeError} If there are letters in seq that aren't in the sequence alphabet
     */
    Update(seq, thisArg){
        if (!Type.is(seq, String)) throw new TypeError("kmer.update takes a String as its only positional argument");
        else if (seq.length < this.k) throw new TypeError(`kmer.update takes a String with length greater than ${this.k} as its only positional argument`);
        else if (seq.match(this.notInAlphabet)) throw new TypeError(`kmer.update takes a String with letters from the alphabet '${this.alphabet}'`);

        // Count every k-length window of seq in its profile slot
        for (const word of this.kmerArray(seq, this.k)) {
            this.profile[this.sequenceToBinary(word)] += 1;
        }
    }


    /**
     * This method streams sequence data to update the kmer profile by side-effect
     *
     * Validation failures (and failures from update) are thrown from inside the
     * transform function rather than passed to the stream callback.
     *
     * @method streamingUpdate
     * @throws {TypeError} If the input stream doesn't yield objects
     * @throws {TypeError} If the input stream's objects don't have a .seq attribute
     * @return {through2} A through2 stream wrapper
     *
     * @example
     *     >var fasta = require('bionode-fasta'); // Bionode-fastq also
     *     >var fs = require('fs');
     *     >fs.createReadStream("/path/to/example.fasta", {encoding: "UTF-8"})
     *        .pipe(fasta.obj())
     *        .pipe(kmer.streamingUpdate())
     *        .on('finish', function(){
     *            console.log("Done!");
     *        });
     *     >console.log(kmer.profile)
     * @example
     *     >var AWS = require('aws-sdk');
     *     >var s3 = new AWS.S3({apiVersion: '2006-03-01'});
     *     >var fasta = require('bionode-fasta');
     *     >var params = {Bucket: 'bucketname', Key: 'path/to/example.fasta'}
     *     >s3.getObject(params).createReadStream()
     *        .pipe(fasta.obj())
     *        .pipe(kmer.streamingUpdate())
     *        .on('finish', function(){
     *            console.log("Done!");
     *        });
     *     >console.log(kmer.profile);
     */
    streamingUpdate(){
        // Capture the pre-bound updater; the transform function itself is
        // invoked by the stream machinery, not as a method of this instance.
        const update = this.update;
        const self = this;
        return through2.obj(function(data, enc, callback){
            if (!Type.is(data, Object)) throw new TypeError("kmer.streamingUpdate expects the pipe to produce objects");
            else if (! ("seq" in data && Type.is(data.seq, String))) throw new TypeError("kmer.streamingUpdate expects the pipe to produce objects with a 'seq' attribute. See 'bionode-fasta' for examples.");

            update(data.seq, self);
            callback();
        });
    }

    /**
     * Returns an integer encoding of a biological sequence
     *
     * The sequence is read as a base-N number (N = alphabet size) whose digits
     * are the letters' positions in the alphabet, most significant letter first.
     * For a k-length sequence the result is its index into .profile.
     *
     * @method sequenceToBinary
     * @param {String} s A biological sequence to convert into an integer
     * @throws {TypeError} If s is not a String
     * @throws {TypeError} If there are letters in seq that aren't in the sequence alphabet
     * @return {Number} Returns an integer encoding of a k-mer
     *
     * @example
     *     >var testKmer = "AAAAAAAAA" // Length of testKmer matches our initial value of k, 9
     *     >var testKmerIndex = kmer.sequenceToBinary(testKmer);
     *     >console.log( kmer.profile[testKmerIndex] );
     */
    sequenceToBinary(s){
        if (!Type.is(s, String)) throw new TypeError("kmer.sequenceToBinary takes a String as its only positional argument");
        else if (s.match(this.notInAlphabet)) throw new TypeError("kmer.sequenceToBinary takes a String with letters in the specified alphabet as its only positional argument");

        const base = this.alphabet.length;
        let index = 0;
        // Multiply/add instead of shift/or: correct for any alphabet size and
        // not limited to 32 bits (equal to the shift form for 4 letters).
        for (let i = 0; i < s.length; i++) {
            index = index * base + this.letterToBinary[s[i]];
        }
        return index;
    }

    /**
     * Returns a biological sequence from an integer encoding
     *
     * Inverse of sequenceToBinary() for sequences of length k. Only the k
     * lowest base-N digits of x are used.
     *
     * @method binaryToSequence
     * @param {Number} x An integer encoding of a biological sequence
     * @throws {TypeError} If x is not a Number
     * @throws {TypeError} If x is not an integer (float, NaN or Infinity)
     * @throws {TypeError} If x is negative
     * @return {String} A biological sequence
     *
     * @example
     *     >var testKmerIndex = 0; // Analogous to sequenceToBinary() example
     *     >kmer.binaryToSequence(testKmerIndex);
     *     'AAAAAAAAA'
     */
    binaryToSequence(x){
        if (!Type.is(x, Number)) throw new TypeError("kmer.binaryToSequence takes a Number as its only positional argument");
        else if (!Number.isInteger(Number(x))) throw new TypeError("kmer.binaryToSequence takes an integer as its only positional argument. This was a float");
        else if (x < 0) throw new TypeError("kmer.binaryToSequence takes a non-negative integer as its only positional argument");

        const base = this.alphabet.length;
        const letters = new Array(this.k);
        let remaining = Number(x);
        // Peel digits off the least-significant end, filling the word right-to-left
        for (let pos = this.k - 1; pos >= 0; pos--) {
            letters[pos] = this.binaryToLetter[remaining % base];
            remaining = Math.floor(remaining / base);
        }
        return letters.join('');
    }

    /**
     * Sums the counts for the whole profile.
     * It also updates the associated property .totalProfileCounts as a side-effect
     *
     * @method TotalProfileCounts
     * @return {BigNumber} Returns a BigNumber.js sum of all counts from the profile array
     *
     * @example
     *     >// After a streaming update, the attribute .totalProfileCounts isn't always updated
     *     >console.log( kmer.totalProfileCounts );
     *     0
     *     >kmer.TotalProfileCounts();
     *     10300
     *     >console.log( kmer.totalProfileCounts );
     *     10300
     */
    TotalProfileCounts(){
        // Seeding the fold guarantees a BigNumber result for any profile length
        this.totalProfileCounts = this.profile.reduce((sum, count) => sum.plus(count), new BigNumber(0));
        return this.totalProfileCounts;
    }

    /**
     * Calculates a frequency of a sequence in the profile
     *
     * Divides by the cached .totalProfileCounts, so call TotalProfileCounts()
     * first if the profile has been updated since it was last computed.
     *
     * @method frequency
     * @param {String} seq A sequence to retrieve the relative count/frequency
     * @throws {TypeError} If seq is not a String
     * @throws {TypeError} If seq is not a kmer (has a length of k)
     * @throws {TypeError} If there are letters in seq that aren't in the sequence alphabet
     * @return {BigNumber} The frequency (count/totalCounts) of the sequence from the profile
     *
     * @example
     *     >var testKmer = "AAAAAAAAA";
     *     >var testKmerFrequency = kmer.frequency(testKmer); // Returns a BigNumber.js
     *     >console.log( testKmerFrequency.toNumber() );
     *     0.123457
     */
    frequency(seq){
        if (!Type.is(seq, String)) throw new TypeError("kmer.frequency takes a String as its only positional argument");
        else if (seq.length !== this.k) throw new TypeError(`kmer.frequency takes a String with length ${this.k} as its only positional argument`);

        const count = this.profile[this.sequenceToBinary(seq)];
        return new BigNumber(count).div(this.totalProfileCounts);
    }

    /**
     * Calculates the transition probability of one sequence to the next in a Markov chain
     *
     * Recomputes .totalProfileCounts before calculating. The result is
     * frequency(seq2) divided by the summed frequencies of every kmer that
     * starts with seq1's (k-1)-length suffix; it is 0 when seq2 does not
     * start with that suffix.
     *
     * @method transitionProbability
     * @param {String} seq1 The kmer being transitioned from
     * @param {String} seq2 The kmer being transitioned to
     * @throws {TypeError} If seq1 is not a String
     * @throws {TypeError} If seq1 is not a kmer (has a length of k)
     * @throws {TypeError} If seq2 is not a String
     * @throws {TypeError} If seq2 is not a kmer (has a length of k)
     * @return {BigNumber} The transition probability of seq1 to seq2
     *
     * @example
     *     >var testKmer1 = "AAAAAAAAA";
     *     >var testKmer2 = "AAAAAAAAT";
     *     >var transProbTK1toTK2 = kmer.transitionProbability(testKmer1, testKmer2);
     *     >console.log( transProbTK1toTK2.toNumber() );
     *     0.111048
     */
    transitionProbability(seq1, seq2){
        if (!Type.is(seq1, String)) throw new TypeError("kmer.transitionProbability takes a String as its first positional argument");
        else if (!Type.is(seq2, String)) throw new TypeError("kmer.transitionProbability takes a String as its second positional argument");
        else if (seq1.length !== this.k) throw new TypeError(`kmer.transitionProbability takes a sequence of length ${this.k} as its first positional argument`);
        else if (seq2.length !== this.k) throw new TypeError(`kmer.transitionProbability takes a sequence of length ${this.k} as its second positional argument`);

        this.TotalProfileCounts();
        return this._transitionProbability(seq1, seq2);
    }

    /**
     * Transition probability between two already-validated kmers, using the
     * cached .totalProfileCounts as-is (callers refresh it beforehand).
     *
     * @method _transitionProbability
     * @private
     * @param {String} seq1 The kmer being transitioned from
     * @param {String} seq2 The kmer being transitioned to
     * @return {BigNumber} The transition probability of seq1 to seq2
     */
    _transitionProbability(seq1, seq2){
        const suffix1 = seq1.substring(1, seq1.length);
        const prefix2 = seq2.substring(0, seq2.length - 1);
        // seq2 can only follow seq1 if they overlap on k-1 letters
        if (suffix1 !== prefix2) return new BigNumber(0);

        // Combined frequency of every kmer that could follow seq1
        const successors = this.binaryToLetter
              .map((letter) => this.frequency(suffix1 + letter))
              .reduce((sum, freq) => sum.plus(freq));
        return this.frequency(seq2).div(successors);
    }

    /**
     * Calculates the Markov chain probability of a sequence from its transition probabilities
     *
     * Recomputes .totalProfileCounts once, then multiplies together the
     * transition probabilities of each pair of consecutive kmers in seq.
     *
     * @method probabilityOfSequence
     * @param {String} seq A biological sequence
     * @throws {TypeError} If seq is not a String
     * @throws {TypeError} If seq is not larger than a kmer (has a length > k)
     * @throws {TypeError} If there are letters in seq that aren't in the sequence alphabet
     * @return {BigNumber} Returns the Markov-chain probability of the input sequence
     *
     * @example
     *     >var testKmer = "AAACCCCCGCACCCGCGGGGGTTTCAGCGTGTCG";
     *     >var testKmerProb = kmer.probabilityOfSequence(testKmer);
     *     >console.log( testKmerProb.toNumber() );
     *     0.000033333888887777710
     */
    probabilityOfSequence(seq){
        if (!Type.is(seq, String)) throw new TypeError("kmer.probabilityOfSequence takes a String as its only positional argument");
        else if (seq.length <= this.k) throw new TypeError("kmer.probabilityOfSequence takes a String with length greater than " + this.k + " as its only positional argument");
        else if (seq.match(this.notInAlphabet)) throw new TypeError(`kmer.probabilityOfSequence takes a String with letters from the alphabet '${this.alphabet}'`);

        const words = this.kmerArray(seq, this.k);
        // Sum the profile once up front instead of once per transition
        this.TotalProfileCounts();
        let probability = new BigNumber(1);
        for (let i = 0; i < words.length - 1; i++) {
            probability = this._transitionProbability(words[i], words[i + 1]).times(probability);
        }
        return probability;
    }
}


    

// CommonJS export: the Kmer class is the module's only export.
module.exports = Kmer;