class Bio::Fastq

Bio::Fastq is a parser for FASTQ format.

Constants

DefaultFormatName

Default format name

FLATFILE_SPLITTER

Splitter for Bio::FlatFile

FormatNames

Available format names.

Formats

Available format name symbols.

Attributes

definition[R]

definition; ID line (begins with @)

entry_overrun[R]
header[R]

misc lines before the entry (String or nil)

quality_string[R]

quality as a string

sequence_string[R]

raw sequence data as a String object

Public Class Methods

new(str = nil) click to toggle source

Creates a new Fastq object from formatted text string.

The format of quality scores should be specified later by using format= method.


Arguments:

  • str: Formatted string (String)

    # File lib/bio/db/fastq.rb
# Creates a new Fastq object from a formatted text string.
#
# Leading lines are offered to add_header_line until it rejects one, then the
# following lines are offered to add_line until it rejects one. A rejected
# line is pushed back onto the scanner, and everything that was not consumed
# is stored in @entry_overrun.
#
# Lines may be terminated by "\n", "\r\n" or a lone "\r"; a final line
# without a terminator is also accepted.
#
# The format of quality scores should be specified later with format=.
#
# Arguments:
# * str: formatted string (String). When nil, nothing is parsed and no
#   instance variable is set.
def initialize(str = nil)
  return unless str
  sc = StringScanner.new(str)
  # "." matches "\r", so a pattern such as /.*(?:\n|\r|\r\n)?/ can never reach
  # its "\r" alternatives and treats CR-only text as one single line.
  # Excluding both terminator characters from the line body, and trying
  # "\r\n" before the single-character terminators, splits all three styles.
  line_re = /[^\r\n]*(?:\r\n|\n|\r)?/
  # Header part: lines that precede the first "@" line.
  while !sc.eos? && (line = sc.scan(line_re))
    unless add_header_line(line)
      sc.unscan
      break
    end
  end
  # Entry part: ID line, sequence, "+" line and quality.
  while !sc.eos? && (line = sc.scan(line_re))
    unless add_line(line)
      sc.unscan
      break
    end
  end
  @entry_overrun = sc.rest
end

Public Instance Methods

add_header_line(line) click to toggle source

Adds a header line if the header data is not yet given and the given line is suitable as a header. Returns self if the header line was added successfully; otherwise returns false (the line is not added).

    # File lib/bio/db/fastq.rb
# Appends +line+ to the header text unless the line begins with "@".
#
# @header is initialised to an empty String on the first call, even when the
# line is then rejected.
#
# Arguments:
# * line: one line of input text (String)
# Returns:: self when the line was appended, false when it was rejected
324 def add_header_line(line)
325   @header ||= ""
# A leading "@" marks the ID line of an entry, so the header ends here.
326   if line[0,1] == "@" then
327     false
328   else
329     @header.concat line
330     self
331   end
332 end
add_line(line) click to toggle source

Adds a line to the entry if the given line is regarded as a part of the current entry.

    # File lib/bio/db/fastq.rb
# Adds a line to the entry if the given line is regarded as a part of the
# current entry.
#
# Lines are interpreted by position within the entry:
# 1. the first line is the ID line and is stored in @definition (without the
#    leading "@"); if it does not begin with "@", the whole line is stored
#    and an Error::No_atmark is recorded in @parse_errors;
# 2. subsequent lines are appended to @sequence_string until a line that
#    begins with "+" is seen, whose remainder is stored in @definition2;
# 3. lines after the "+" line are appended to @quality_string.
#
# The trailing line terminator is removed before the line is stored.
#
# Arguments:
# * line: one line of input text (String)
# Returns:: self when the line was consumed; false when the line begins with
#           "@" and the quality string is already at least as long as the
#           sequence (the line is not added)
def add_line(line)
  line = line.chomp

  # 1. ID line.
  unless defined? @definition
    if line[0, 1] == "@"
      @definition = line[1..-1]
    else
      # Keep the line as the definition, but remember the format error.
      @definition = line
      @parse_errors ||= []
      @parse_errors.push Error::No_atmark.new
    end
    return self
  end

  # 2. Sequence lines, terminated by the "+" line.
  unless defined? @definition2
    @sequence_string ||= ''
    if line[0, 1] == '+'
      @definition2 = line[1..-1]
    else
      @sequence_string.concat line
    end
    return self
  end

  # 3. Quality lines. A quality line may itself begin with "@", so such a
  # line is only rejected once the quality string is at least as long as
  # the sequence.
  @quality_string ||= ''
  if line[0, 1] == "@" && @quality_string.size >= @sequence_string.size
    return false
  end
  @quality_string.concat line
  self
end
entry_id() click to toggle source

Identifier of the entry. Normally, the first word of the ID line.

    # File lib/bio/db/fastq.rb
# Identifier of the entry. Normally, the first word of the ID line.
#
# The value is computed once from @definition and cached in @entry_id. When
# the definition contains no word (it is empty or only whitespace), the
# definition itself is returned.
#
# Returns:: (String) identifier, or nil when no ID line has been read yet
def entry_id
  return @entry_id if defined? @entry_id
  # An object created without text has no definition. Return nil instead of
  # calling String methods on nil, and do not cache it so that an ID line
  # added later is still picked up.
  return nil unless defined?(@definition) && @definition
  @entry_id = @definition.strip.split(/\s+/)[0] || @definition
end
error_probabilities() click to toggle source

Estimated probability of error for each base.


Returns

(Array containing Float) error probability values

    # File lib/bio/db/fastq.rb
# Estimated probability of error for each base.
#
# The result is computed on the first call and cached in
# @error_probabilities. If no format has been set, the format is first set to
# the class's DefaultFormatName. The values are obtained by passing
# quality_scores to the format object's q2p method.
#
# Returns:: (Array containing Float) error probability values
529 def error_probabilities
530   unless defined? @error_probabilities then
# "self.format ||= x" assigns through format= only when format returns nil.
531     self.format ||= self.class::DefaultFormatName
532     a = @format.q2p(self.quality_scores)
533     @error_probabilities = a
534   end
535   @error_probabilities
536 end
format() click to toggle source

Format name. One of “fastq-sanger”, “fastq-solexa”, “fastq-illumina”, or nil (when not specified).


Returns

(String or nil) format name

    # File lib/bio/db/fastq.rb
# Format name.
#
# Returns:: the name of the format object held in @format, or nil when
#           @format is undefined or not set
497 def format
498   ((defined? @format) && @format) ? @format.name : nil
499 end
format=(name) click to toggle source

Specify the format. If the format is not found, raises RuntimeError.

Available formats are:

"fastq-sanger" or :fastq_sanger
"fastq-solexa" or :fastq_solexa
"fastq-illumina" or :fastq_illumina

Arguments:

  • (required) name: format name (String or Symbol).

Returns

(String) format name

    # File lib/bio/db/fastq.rb
# Specifies the format.
#
# +name+ is looked up in FormatNames first and then in Formats. When found,
# reset_state is called, the singleton instance of the format class is stored
# in @format, and the format name is returned. When not found, RuntimeError
# ("unknown format") is raised.
#
# When +name+ is nil or false, only reset_state is called and nil is
# returned.
# NOTE(review): in that case @format is left as it is, so a previously
# selected format stays in effect -- confirm this is intended.
#
# Arguments:
# * (required) name: format name (String or Symbol)
# Returns:: (String) format name
476 def format=(name)
477   if name then
478     f = FormatNames[name] || Formats[name]
479     if f then
# reset_state is defined elsewhere; presumably it discards values cached
# for the previous format -- TODO confirm.
480       reset_state
481       @format = f.instance
482       self.format
483     else
484       raise "unknown format"
485     end
486   else
487     reset_state
488     nil
489   end
490 end
mask(threshold, mask_char = 'n') click to toggle source

Masks low quality sequence regions. For each sequence position, if the quality score is smaller than the threshold, the sequence in the position is replaced with mask_char.

Note: This method does not take quality_score_type into account.


Arguments:

  • (required) threshold : (Numeric) threshold

  • (optional) mask_char : (String) character used for masking

Returns

Bio::Sequence object

    # File lib/bio/db/fastq.rb
# Masks low quality sequence regions.
#
# Delegates to mask_with_quality_score on the object returned by
# to_biosequence, passing both arguments through unchanged.
#
# Arguments:
# * (required) threshold: (Numeric) threshold
# * (optional) mask_char: (String) character used for masking; defaults to 'n'
# Returns:: the result of mask_with_quality_score
668 def mask(threshold, mask_char = 'n')
669   to_biosequence.mask_with_quality_score(threshold, mask_char)
670 end
nalen() click to toggle source

length of naseq

    # File lib/bio/db/fastq.rb
# Length of the nucleic acid sequence, i.e. naseq.length.
433 def nalen
434   naseq.length
435 end
naseq() click to toggle source

returns Bio::Sequence::NA

    # File lib/bio/db/fastq.rb
# Returns the sequence as a Bio::Sequence::NA object.
#
# The object is built from @sequence_string on the first call and cached in
# @naseq; later calls return the same object.
425 def naseq
426   unless defined? @naseq then
427     @naseq = Bio::Sequence::NA.new(@sequence_string)
428   end
429   @naseq
430 end
qualities()
Alias for: quality_scores
quality_score_type() click to toggle source

The meaning of the quality scores. It may be one of :phred, :solexa, or nil.

    # File lib/bio/db/fastq.rb
# The meaning of the quality scores, as reported by the format object's
# quality_score_type method.
#
# If no format has been set, the format is first set to the class's
# DefaultFormatName.
504 def quality_score_type
505   self.format ||= self.class::DefaultFormatName
506   @format.quality_score_type
507 end
quality_scores() click to toggle source

Quality score for each base. For “fastq-sanger” or “fastq-illumina”, it is PHRED score. For “fastq-solexa”, it is Solexa score.


Returns

(Array containing Integer) quality score values

    # File lib/bio/db/fastq.rb
# Quality score for each base.
#
# The result is computed on the first call and cached in @quality_scores.
# If no format has been set, the format is first set to the class's
# DefaultFormatName. The scores are obtained by passing @quality_string to
# the format object's str2scores method.
#
# Returns:: (Array containing Integer) quality score values
515 def quality_scores
516   unless defined? @quality_scores then
517     self.format ||= self.class::DefaultFormatName
518     s = @format.str2scores(@quality_string)
519     @quality_scores = s
520   end
521   @quality_scores
522 end
Also aliased as: qualities
seq() click to toggle source

returns Bio::Sequence::Generic

    # File lib/bio/db/fastq.rb
# Returns the sequence as a Bio::Sequence::Generic object.
#
# The object is built from @sequence_string on the first call and cached in
# @seq; later calls return the same object.
438 def seq
439   unless defined? @seq then
440     @seq = Bio::Sequence::Generic.new(@sequence_string)
441   end
442   @seq
443 end
to_biosequence() click to toggle source

Returns sequence as a Bio::Sequence object.

Note: If you modify the returned Bio::Sequence object, the sequence or definition in this Fastq object might also be changed (though not always), because data may be shared for efficiency.

    # File lib/bio/db/fastq.rb
# Returns the sequence as a Bio::Sequence object, built by
# Bio::Sequence.adapter with this object and the
# Bio::Sequence::Adapter::Fastq adapter module.
653 def to_biosequence
654   Bio::Sequence.adapter(self, Bio::Sequence::Adapter::Fastq)
655 end
to_s() click to toggle source

Returns a Fastq formatted string constructed from instance variables. The string always consists of four lines, without wrapping of the sequence and quality strings, and the third line always contains only “+”. This may differ from the initial entry.

Note that use of the method may be inefficient and may lose performance because new string object is created every time it is called. For showing an entry as-is, consider using Bio::FlatFile#entry_raw. For output with various options, use Bio::Sequence#output(:fastq).

    # File lib/bio/db/fastq.rb
# Returns a Fastq formatted String built from @definition, @sequence_string
# and @quality_string.
#
# The result always has four "\n"-terminated lines: "@" plus the definition,
# the sequence, a bare "+", and the quality string. A new String is created
# on every call.
420 def to_s
421   "@#{@definition}\n#{@sequence_string}\n+\n#{@quality_string}\n"
422 end
validate_format(errors = nil) click to toggle source

Format validation.

If an array is given as the argument, error objects are pushed to the array when errors are found. Currently, the following errors may be added to the array. (All errors are under the Bio::Fastq namespace, for example, Bio::Fastq::Error::Diff_ids.)

  • Error::Diff_ids – the identifiers in the two lines are different
  • Error::Long_qual – length of quality is longer than the sequence
  • Error::Short_qual – length of quality is shorter than the sequence
  • Error::No_qual – no quality characters found
  • Error::No_seq – no sequence found
  • Error::Qual_char – invalid character in the quality
  • Error::Seq_char – invalid character in the sequence
  • Error::Qual_range – quality score value out of range
  • Error::No_ids – sequence identifier not found
  • Error::No_atmark – the first identifier does not begin with “@”
  • Error::Skipped_unformatted_lines – the parser skipped unformatted lines that could not be recognized as FASTQ format


Arguments:

  • (optional) errors: (Array or nil) an array for pushing error messages. The array should be empty.

Returns

true if no error was found; false if at least one error was found.

    # File lib/bio/db/fastq.rb
# Format validation.
#
# Runs a fixed series of checks against the parsed entry and collects one
# error object per problem found:
# * non-blank header text before the entry (Error::Skipped_unformatted_lines)
# * errors recorded while parsing (@parse_errors)
# * missing identifier (Error::No_ids) or differing identifiers
#   (Error::Diff_ids)
# * missing sequence (Error::No_seq) or quality (Error::No_qual)
# * quality shorter or longer than the sequence (Error::Short_qual,
#   Error::Long_qual)
# * one Error::Seq_char / Error::Qual_char per invalid character
#
# Arguments:
# * (optional) errors: (Array or nil) when given, the collected error objects
#   are appended to it
# Returns:: true when no error was found, false otherwise
562 def validate_format(errors = nil)
563   err = []
564 
565   # if header exists, the format might be broken.
566   if defined? @header and @header and !@header.strip.empty? then
567     err.push Error::Skipped_unformatted_lines.new
568   end
569 
570   # if parse errors exist, adding them
571   if defined? @parse_errors and @parse_errors then
572     err.concat @parse_errors
573   end
574 
575   # check if identifier exists, and identifier matches
# The identifiers are compared only when the second one ("+" line) is
# non-empty.
576   if !defined?(@definition) or !@definition then
577     err.push Error::No_ids.new
578   elsif defined?(@definition2) and
579       !@definition2.to_s.empty? and
580       @definition != @definition2 then
581     err.push Error::Diff_ids.new
582   end
583 
584   # check if sequence exists
585   has_seq  = true
586   if !defined?(@sequence_string) or !@sequence_string then
587     err.push Error::No_seq.new
588     has_seq = false
589   end
590 
591   # check if quality exists
592   has_qual = true
593   if !defined?(@quality_string) or !@quality_string then
594     err.push Error::No_qual.new
595     has_qual = false
596   end
597 
598   # sequence and quality length check
599   if has_seq and has_qual then
600     slen = @sequence_string.length
601     qlen = @quality_string.length
602     if slen > qlen then
603       err.push Error::Short_qual.new
604     elsif qlen > slen then
605       err.push Error::Long_qual.new
606     end
607   end
608 
609   # sequence character check
# Space, bytes 0x00-0x1f and bytes 0x7f-0xff are treated as invalid.
# "sc.pos - sc.matched_size" is the offset at which the match started.
610   if has_seq then
611     sc = StringScanner.new(@sequence_string)
612     while sc.scan_until(/[ \x00-\x1f\x7f-\xff]/n)
613       err.push Error::Seq_char.new(sc.pos - sc.matched_size)
614     end
615   end
616 
617   # quality character check
# The accepted byte range depends on the selected format: 0x21-0x7e for
# fastq-sanger, 0x3b-0x7e for fastq-solexa, 0x40-0x7e for fastq-illumina.
# With no format selected, the same pattern as for the sequence is used.
618   if has_qual then
619     fmt = if defined?(@format) and @format then
620             @format.name
621           else
622             nil
623           end
624     re = case fmt
625          when 'fastq-sanger'
626            /[^\x21-\x7e]/n
627          when 'fastq-solexa'
628            /[^\x3b-\x7e]/n
629          when 'fastq-illumina'
630            /[^\x40-\x7e]/n
631          else
632            /[ \x00-\x1f\x7f-\xff]/n
633          end
634     sc = StringScanner.new(@quality_string)
635     while sc.scan_until(re)
636       err.push Error::Qual_char.new(sc.pos - sc.matched_size)
637     end
638   end
639 
640   # if "errors" is given, set errors
641   errors.concat err if errors
642   # returns true if no error; otherwise, returns false
643   err.empty? ? true : false
644 end