class Bio::SOFT

bio/db/soft.rb - Interface for SOFT formatted files

Author

Trevor Wennblom <trevor@corevx.com>

Copyright

Copyright © 2007 Midwinter Laboratories, LLC (midwinterlabs.com)

License

The Ruby License

Description

“SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata.” – GEO, National Center for Biotechnology Information

The Bio::SOFT class reads SOFT Series or Platform formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be viewed with Ruby methods.

Bio::SOFT also supports the reading of SOFT DataSet files which contain one database, one dataset, and many subsets.

Format specification is located here:

SOFT data files may be directly downloaded here:

NCBI’s Gene Expression Omnibus (GEO) is here:

Usage

If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.

The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.

Attributes are dynamically created based on the data in the file. Predefined keys have not been created in advance due to the variability of SOFT files in-the-wild.

Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case sensitive. Table headers are case insensitive.

require 'bio'

lines = IO.readlines('GSE3457_family.soft') 
soft = Bio::SOFT.new(lines)

soft.platform[:geo_accession]             # => "GPL2092"
soft.platform[:organism]                  # => "Populus"
soft.platform[:contributor]               # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.platform[:data_row_count]            # => "240"
soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
soft.platform[:"contact_zip/postal_code"] # => "97331"
soft.platform[:table].header              # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
soft.platform[:table].header_description  # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
soft.platform[:table].rows.size           # => 240
soft.platform[:table].rows[5]             # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
soft.platform[:table].rows[5][4]          # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5][:organism]  # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"

soft.series[:geo_accession]               # => "GSE3457"
soft.series[:contributor]                 # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.series[:platform_id]                 # => "GPL2092"
soft.series[:sample_id].size              # => 74
soft.series[:sample_id][0..4]             # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.samples.size                         # => 74
soft.samples[:GSM77600][:series_id]       # => "GSE3457"
soft.samples['GSM77600'][:series_id]      # => "GSE3457"
soft.samples[:GSM77600][:platform_id]     # => "GPL2092"
soft.samples[:GSM77600][:type]            # => "RNA"
soft.samples[:GSM77600][:title]           # => "jst2b2"
soft.samples[:GSM77600][:table].header    # => ["ID_REF", "VALUE"]
soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
soft.samples[:GSM77600][:table].rows.size # => 217
soft.samples[:GSM77600][:table].rows[5]   # => ["A039P68U", "8.19"]
soft.samples[:GSM77600][:table].rows[5][0]        # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5][:id_ref]  # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"

lines = IO.readlines('GDS100.soft') 
soft = Bio::SOFT.new(lines)

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.subsets.size                         # => 8
soft.subsets.keys                         # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
soft.subsets[:GDS100_7]                   # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
soft.subsets['GDS100_7'][:sample_id]      # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:sample_id]       # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:dataset_id]      # => "GDS100"

soft.dataset[:order]                      # => "none"
soft.dataset[:sample_organism]            # => "Escherichia coli"
soft.dataset[:table].header               # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
soft.dataset[:table].rows.size            # => 5764
soft.dataset[:table].rows[5]              # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
soft.dataset[:table].rows[5][4]           # => "0.242"
soft.dataset[:table].rows[5][:gsm549]     # => "0.097"
soft.dataset[:table].rows[5][:GSM549]     # => "0.097"
soft.dataset[:table].rows[5]['GSM549']    # => "0.097"

Constants

LINE_TYPE_ENTITY_ATTRIBUTE
LINE_TYPE_ENTITY_INDICATOR
LINE_TYPE_TABLE_HEADER
TABLE_COLUMN_DELIMITER

A data table row is identified by the absence of a line-type character at the start of the line.

Attributes

database[RW]
dataset[RW]
platform[RW]
samples[RW]
series[RW]
subsets[RW]

Public Class Methods

new(lines=nil) click to toggle source

Constructor


Arguments

  • lines: (required) contents of a SOFT formatted file as an Array of lines (Strings), e.g. the result of IO.readlines

Returns

Bio::SOFT

    # File lib/bio/db/soft.rb
# Creates one empty container per SOFT entity kind (database, series,
# platform, samples, dataset, subsets) and then hands +lines+ to #process
# to populate them.
def initialize(lines=nil)
  @database = Database.new

  # Entities found in Series / Platform ("family") files.
  @series, @platform, @samples = Series.new, Platform.new, Samples.new

  # Entities found in DataSet files.
  @dataset, @subsets = Dataset.new, Subsets.new

  process(lines)
end

Protected Instance Methods

custom_raise( line_number_with_0_based_indexing, msg ) click to toggle source
    # File lib/bio/db/soft.rb
# Raises a RuntimeError whose message is the 1-based input line number
# followed by +msg+, separated by a tab.
#
# * +line_number_with_0_based_indexing+: index of the offending line as
#   yielded by each_with_index (0-based); reported to the user as index + 1
# * +msg+: explanation appended after the line number
def custom_raise( line_number_with_0_based_indexing, msg )
  human_line_number = line_number_with_0_based_indexing + 1
  location = "Error processing input line: #{human_line_number}"
  raise RuntimeError, [location, msg].join("\t")
end
error_msg( i, extra_info=nil ) click to toggle source
    # File lib/bio/db/soft.rb
# Returns the text for one of the parser's numbered error conditions.
#
# * +i+: error code
#   * 10 - table row found outside of a table
#   * 20 - table header found inside an entity that may not hold a table
#   * 30 - entity attribute line found before any entity indicator line
#   * 40 - unrecognized entity indicator
# * +extra_info+: (optional) context for the message; code 20 interpolates it
#   verbatim, code 40 appends its +inspect+ form when it is given
#
# Returns a String. Raises IndexError for any other code.
def error_msg( i, extra_info=nil )
  parts =
    case i
    when 10
      ["Lines without line-type characters are rows in a table, but",
       "a line containing an entity indicator such as",
       "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
       "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
       "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
       "previously encountered or it does not appear that this line is",
       "in a table."]
    when 20
      # Tables are accepted inside samples, platforms and datasets.
      ["Tables are only allowed inside SAMPLE, PLATFORM, and DATASET.",
       "Current table information found inside #{extra_info}."]
    when 30
      ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
       "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
    when 40
      unknown = ["Unknown entity indicator.  Must be DATABASE, SAMPLE, PLATFORM,",
                 "SERIES, DATASET, or SUBSET."]
      # Show the offending input when the caller supplied it.
      unknown << "Found: #{extra_info.inspect}." unless extra_info.nil?
      unknown
    else
      raise IndexError, "Unknown error message requested."
    end

  parts.join(" ")
end
process(lines) click to toggle source
    # File lib/bio/db/soft.rb
# Parses SOFT formatted input line by line and stores what it finds in the
# entity containers created by the constructor (@database, @dataset,
# @platform, @series, @samples, @subsets).
#
# The first character of each non-blank line selects the handling:
# * LINE_TYPE_ENTITY_INDICATOR - starts a new entity; later lines apply to it
# * LINE_TYPE_ENTITY_ATTRIBUTE - key/value pair stored on the current entity
#   (a repeated key is collected into an Array)
# * LINE_TYPE_TABLE_HEADER     - column description for the entity's table
# * anything else              - a header or data row of the current table
#
# * +lines+: enumerable of Strings responding to each_with_index; nil is
#   treated as empty input
#
# Malformed input is reported through #custom_raise. The input strings are
# never modified.
def process(lines)
  # The constructor defaults +lines+ to nil; there is nothing to parse then.
  return if lines.nil?

  current_indicator = nil
  current_class_accessor = nil
  in_table = false

  lines.each_with_index do |raw_line, line_number|
    # Strip a copy: String#strip! would alter the caller's data and raises on
    # frozen strings. to_s lets nil entries be skipped like blank lines.
    line = raw_line.to_s.strip
    next if line.empty?

    case line[0, 1]
    when LINE_TYPE_ENTITY_INDICATOR
      current_indicator, value = split_label_value_in( line[1..-1] )

      case current_indicator
      when 'DATABASE'
        current_class_accessor = @database
      when 'DATASET'
        current_class_accessor = @dataset
      when 'PLATFORM'
        current_class_accessor = @platform
      when 'SERIES'
        current_class_accessor = @series
      when 'SAMPLE'
        # One entry per sample, keyed by the indicator's value.
        @samples[value] = Sample.new
        current_class_accessor = @samples[value]
      when 'SUBSET'
        # One entry per subset, keyed by the indicator's value.
        @subsets[value] = Subset.new
        current_class_accessor = @subsets[value]
      else
        custom_raise( line_number, error_msg(40, line) )
      end

    when LINE_TYPE_ENTITY_ATTRIBUTE
      custom_raise( line_number, error_msg(30) ) if current_indicator.nil?

      # Handle lines such as '!platform_table_begin' and '!platform_table_end':
      # they delimit the table and carry no key/value pair.
      if in_table
        next if line =~ %r{table_begin}

        if line =~ %r{table_end}
          in_table = false
          next
        end
      end

      # shift_key=true drops everything up to the first underscore of the label.
      key, value = split_label_value_in( line, true )
      key_s = key.to_sym

      if current_class_accessor.include?( key_s )
        # Repeated attribute: promote the stored value to an Array, then append.
        existing = current_class_accessor[ key_s ]
        current_class_accessor[ key_s ] = [ existing ] unless existing.instance_of?( Array )
        current_class_accessor[ key_s ] << value
      else
        current_class_accessor[ key_s ] = value
      end

    when LINE_TYPE_TABLE_HEADER
      unless %w[SAMPLE PLATFORM DATASET].include?( current_indicator )
        custom_raise( line_number, error_msg(20, current_indicator.inspect) )
      end

      in_table = true   # may already be true; re-assigning is cheaper than checking

      # We only expect one table per platform, sample or dataset.
      current_class_accessor[:table] ||= Table.new
      key, value = split_label_value_in( line )
      # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
      current_class_accessor[:table].header_description[ key[1..-1] ] = value

    else
      # No line-type character: should be a row in a table.
      if current_indicator.nil? || !in_table
        custom_raise( line_number, error_msg(10) )
      end
      current_class_accessor[:table].add_header_or_row( line )
    end
  end
end
split_label_value_in( line, shift_key=false ) click to toggle source
    # File lib/bio/db/soft.rb
# Splits a "label = value" line at the first "=" (surrounding whitespace is
# discarded) and returns the two halves.
#
# * +line+: text to split, e.g. "!Sample_title = jst2b2"
# * +shift_key+: when true, the label is cut down to the part after its
#   first underscore ("!Sample_title" becomes "title")
#
# Returns a two-element Array: [key, value].
#
# Raises RuntimeError, naming the offending line, when the line has no "="
# or when +shift_key+ is true and the label has no underscore.
def split_label_value_in( line, shift_key=false )
  delimiter = %r{\s*=\s*}.match( line.to_s )
  key, value = delimiter ? [delimiter.pre_match, delimiter.post_match] : [nil, nil]

  if shift_key && key
    underscore_at = key.index('_')
    # No underscore means there is nothing to shift to; reported below.
    key = underscore_at ? key[(underscore_at + 1)..-1] : nil
  end

  if key.nil? || value.nil?
    # Put the offending line in the exception instead of printing it to stdout.
    raise "Unable to split line into label and value: #{line.inspect}"
  end

  [key, value]
end