public class SequenceUtils extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
ALL_DNA_CHARS |
static java.lang.String |
ALL_RNA_CHARS |
static java.lang.String |
DNA_CHARS |
static char |
GAP_CHAR |
static char |
MATCH_CHAR |
static char |
MISSING_DATA_CHAR |
static java.lang.String |
RNA_CHARS |
static char |
STOP_CODON_CHAR |
Constructor and Description |
---|
SequenceUtils() |
Modifier and Type | Method and Description |
---|---|
static char |
aminoAcidConsensus(java.lang.String[] alignmentColumn) |
static java.util.Map<java.lang.Character,java.lang.Double> |
aminoAcidFrequencies(java.lang.String[] alignmentColumn) |
static char |
complement(char c)
Returns the complement of the specified character.
|
static java.lang.String |
complement(java.lang.CharSequence sequence)
Returns the complementing sequence.
|
static java.lang.String |
deleteAllGaps(java.lang.CharSequence seq)
Returns the specified sequence without any gaps.
|
static java.lang.String |
deleteFromLeft(java.lang.String seq,
int count)
Deletes
count characters from the left side of the sequence which are not gaps
("-"). |
static java.lang.String |
deleteFromRight(java.lang.String seq,
int count)
Deletes
count characters from the right side of the sequence which are not gaps
("-"). |
static java.lang.String |
deleteGapsFromLeft(java.lang.String seq,
int count)
Deletes the specified number of gaps from the sequence starting on the left side.
|
static java.lang.String |
deleteGapsFromRight(java.lang.String seq,
int count)
Deletes the specified number of gaps from the sequence starting on the right side.
|
static java.lang.String |
deleteLeadingGaps(java.lang.String seq)
Deletes all leading gaps from a sequence.
|
static java.lang.String |
deleteLeadingTrailingGaps(java.lang.String seq)
Deletes all leading and trailing gaps of the specified sequence.
|
static java.lang.String |
deleteTrailingGaps(java.lang.String seq)
Deletes all trailing gaps from a sequence.
|
static java.lang.String |
dnaToRNA(java.lang.String dna) |
static java.util.Set<java.lang.Character> |
getAminoAcidOneLetterCodes(boolean includeAmbiguity)
Returns a set of all amino acid one letter codes in upper case.
|
static java.util.Set<java.lang.String> |
getAminoAcidThreeLetterCodes(boolean includeAmbiguity)
Returns a set of all amino acid three letter codes in upper case.
|
static java.util.Set<java.lang.Character> |
getNucleotideCharacters()
Returns a set of all nucleotide characters, including 'T' and 'U' as well as all IUPAC
ambiguity codes in upper case.
|
static boolean |
isAminoAcidAmbiguityCode(java.lang.String code)
Determines whether the specified character is an amino acid ambiguity code.
|
static boolean |
isDNAChar(char c) |
static boolean |
isInTokenSet(char c,
java.lang.String tokens) |
static boolean |
isNonAmbiguityAminoAcid(java.lang.String code) |
static boolean |
isNonAmbiguityNucleotide(char c) |
static boolean |
isNucleotideAmbuguityCode(char nucleotide)
Determines whether the specified character is an IUPAC ambiguity code.
|
static boolean |
isRNAChar(char c) |
static java.lang.String |
leftSubsequence(java.lang.String seq,
int count)
Returns the left subsequence with the specified length (which does not include gaps).
|
static int |
lengthWOGaps(java.lang.CharSequence sequence) |
static char |
nucleotideConsensus(char[] alignmentColumn)
Returns the nucleotide that occurs most often in the specified alignment column.
|
static char[] |
nucleotideConstituents(char nucleotide)
If the specified nucleotide is an IUPAC ambiguity code this method returns an array
containing all nucleotides that could be represented by the code.
|
static java.util.Map<java.lang.Character,java.lang.Double> |
nucleotideFrequencies(char[] alignmentColumn)
Counts the nucleotide frequencies in the specified alignment column.
|
static char |
oneLetterAminoAcidByThreeLetter(java.lang.String threeLetterCode)
Converts the specified three letter amino acid code into a one letter representation.
|
static char[] |
oneLetterAminoAcidConstituents(java.lang.String code)
Returns the one letter amino acid representations of all compounds that could be represented by the
specified ambiguity code.
|
static java.lang.String |
randSequence(boolean dna,
int length,
double rateCG)
Returns a sequence of random DNA or RNA characters.
|
static java.lang.String |
randSequence(boolean dna,
int length,
double rateC,
double rateG,
double rateA)
Returns a sequence of random DNA or RNA characters.
|
static java.lang.String |
reverse(java.lang.CharSequence sequence)
Returns the reverse of the specified sequence.
|
static java.lang.String |
reverseComplement(java.lang.CharSequence sequence)
Returns the reverse complemented sequence.
|
static java.lang.String |
rightSubsequence(java.lang.String seq,
int count)
Returns the right subsequence with the specified length (which does not include gaps).
|
static char[] |
rnaConstituents(char nucleotide) |
static java.lang.String |
rnaToDNA(java.lang.String rna) |
static java.lang.String |
threeLetterAminoAcidByOneLetter(char oneLetterCode)
Converts the specified one letter amino acid code into a three letter representation.
|
static java.lang.String[] |
threeLetterAminoAcidConstituents(java.lang.String code)
Returns the three letter amino acid representations of all compounds that could be represented by the
specified ambiguity code.
|
public static final char GAP_CHAR
public static final char MISSING_DATA_CHAR
public static final char MATCH_CHAR
public static final char STOP_CODON_CHAR
public static final java.lang.String DNA_CHARS
public static final java.lang.String ALL_DNA_CHARS
public static final java.lang.String RNA_CHARS
public static final java.lang.String ALL_RNA_CHARS
public SequenceUtils()
public static java.util.Set<java.lang.Character> getNucleotideCharacters()
public static char[] nucleotideConstituents(char nucleotide)
Constituents returned for ambiguity code are always DNA nucleotides (thymine is always used instead
of uracil). (However, if 'U'
is specified for nucleotide
the returned array will contain
'U'
as the only element and not 'T'
).
If the specified character is not an ambiguity code character, an empty array is returned.
nucleotide
- the character that may be an ambiguity codepublic static char[] rnaConstituents(char nucleotide)
public static boolean isNonAmbiguityNucleotide(char c)
public static boolean isNucleotideAmbuguityCode(char nucleotide)
nucleotide
- the character that may be an ambiguity codetrue
if the specified character is a valid ambiguity code, false
otherwise.public static java.util.Set<java.lang.Character> getAminoAcidOneLetterCodes(boolean includeAmbiguity)
includeAmbiguity
- Specify true
here if amino acid ambiguity codes shall also be
contained in the returned set or false
if only unambiguous characters shall be
contained.public static java.util.Set<java.lang.String> getAminoAcidThreeLetterCodes(boolean includeAmbiguity)
includeAmbiguity
- Specify true
here if amino acid ambiguity codes shall also be
contained in the returned set or false
if only unambiguous characters shall be
contained.public static char oneLetterAminoAcidByThreeLetter(java.lang.String threeLetterCode)
"---"
would be converted to '-'
).threeLetterCode
- the three letter code to be convertedjava.lang.IllegalArgumentException
- if the specified code is not a valid three letter amino acid code or a three
character long repetition of the same characterpublic static java.lang.String threeLetterAminoAcidByOneLetter(char oneLetterCode)
oneLetterCode
- the one letter code to be converted"Pro"
)
if oneLetterCode
was a valid amino acid representation or a string consisting of three
repetitions of the specified character otherwise (E.g. '-' would be converted to "---".)public static char[] oneLetterAminoAcidConstituents(java.lang.String code)
code
- the one or three letter ambiguity code to be convertedcode
was not a valid ambiguity codepublic static java.lang.String[] threeLetterAminoAcidConstituents(java.lang.String code)
code
- the one or three letter ambiguity code to be convertedcode
was not a valid ambiguity codepublic static boolean isNonAmbiguityAminoAcid(java.lang.String code)
public static boolean isAminoAcidAmbiguityCode(java.lang.String code)
code
- the string that may be an ambiguity code (one and three letter codes are supported)true
if the specified character is a valid amino acid ambiguity code, false
otherwise.public static java.lang.String reverse(java.lang.CharSequence sequence)
sequence
- the source sequencepublic static char complement(char c)
c
- the nucleotide to be complementedc
public static java.lang.String complement(java.lang.CharSequence sequence)
sequence
- the sequence to be complementedpublic static java.lang.String reverseComplement(java.lang.CharSequence sequence)
sequence
- the sequence to be reverse complementedpublic static java.lang.String rnaToDNA(java.lang.String rna)
public static java.lang.String dnaToRNA(java.lang.String dna)
public static boolean isDNAChar(char c)
public static boolean isRNAChar(char c)
public static boolean isInTokenSet(char c, java.lang.String tokens)
public static int lengthWOGaps(java.lang.CharSequence sequence)
public static java.lang.String leftSubsequence(java.lang.String seq, int count)
If there are less than count
characters in seq
the whole sequence is
returned.
Example: leftSubsequence("AT-A-GCCTG-CTG", 5)
would return AT-A-GC
seq
- the source sequencecount
- the number of characters that shall be contained in the returned subsequence (not
including gaps)public static java.lang.String rightSubsequence(java.lang.String seq, int count)
If there are less than count
characters in seq
the whole sequence is
returned.
Example: rightSubsequence("AT-A-GCCTG-CTG", 5)
would return TG-CTG
seq
- the source sequencecount
- the number of characters that shall be contained in the returned subsequence (not
including gaps)public static java.lang.String deleteFromLeft(java.lang.String seq, int count)
count
characters from the left side of the sequence which are not gaps
("-").seq
- the original sequencecount
- the number of characters to be removedpublic static java.lang.String deleteFromRight(java.lang.String seq, int count)
count
characters from the right side of the sequence which are not gaps
("-").seq
- the original sequencecount
- the number of characters to be removedpublic static java.lang.String deleteLeadingGaps(java.lang.String seq)
seq
- public static java.lang.String deleteTrailingGaps(java.lang.String seq)
seq
- public static java.lang.String deleteGapsFromLeft(java.lang.String seq, int count)
seq
- the sequence that contains the gapscount
- the number of gaps to be removedpublic static java.lang.String deleteGapsFromRight(java.lang.String seq, int count)
seq
- the sequence that contains the gapscount
- the number of gaps to be removedpublic static java.lang.String deleteLeadingTrailingGaps(java.lang.String seq)
seq
- public static java.lang.String deleteAllGaps(java.lang.CharSequence seq)
seq
- public static java.lang.String randSequence(boolean dna, int length, double rateC, double rateG, double rateA)
dna
- determines whether a DNA sequence shall be returned (RNA if false
)length
- the length of the sequencerateC
- the rate for cytosinerateG
- the rate for guaninerateA
- the rate for adeninepublic static java.lang.String randSequence(boolean dna, int length, double rateCG)
dna
- determines whether a DNA sequence shall be returned (RNA if false
)length
- the length of the sequencerateCG
- the rate for cytosine and guanine (must be lower than 1)public static java.util.Map<java.lang.Character,java.lang.Double> nucleotideFrequencies(char[] alignmentColumn)
IUPAC ambiguity codes are supported and counted accordingly for several nucleotides. (Examples: N would be counted as 0.25 for each nucleotide, R would be counted as 0.5 for A and 0.5 for G.)
alignmentColumn
- the contents of the alignment column from which the consensus shall be calculatedpublic static char nucleotideConsensus(char[] alignmentColumn)
alignmentColumn
- the contents of the alignment column from which the consensus shall be calculatedpublic static java.util.Map<java.lang.Character,java.lang.Double> aminoAcidFrequencies(java.lang.String[] alignmentColumn)
public static char aminoAcidConsensus(java.lang.String[] alignmentColumn)