eigenclass logo
MAIN  Index  Search  Changes  PageRank  Login

Changing filename encodings in a tarball, and a small refactoring tale

I had to transfer a few GB from a dying machine to an HFS+ partition; since I had the tarballs, it should have been a piece of cake, but:

 $ tar xvf bleh.tar 
 programaci\363n/
 tar: programaci\363n: Cannot mkdir: Invalid argument
 tar: Error exit delayed from previous errors 

The tarballs contain some ISO-8859-1-encoded filenames, which are obviously not valid UTF-8 streams*1.

So I wrote a script to change the filename encodings in a tarball to UTF8 on the fly. As happens often as of late*2, I had some 2-year-old code of mine lying around that solved most of the problem.

Here's the code (recode-tarball-utf8.rb) in case you also run into troubles with non-UTF8 filenames in your tarballs.

A small refactoring tale

De/encoding a tar header is quite repetitive by itself; a POSIX ("ustar") header looks like this:

struct tarfile_entry_posix {
  char name[100];     # ASCII + (Z unless filled)
  char mode[8];       # 0 padded, octal, null
  char uid[8];        # ditto
  char gid[8];        # ditto
  char size[12];      # 0 padded, octal, null
  char mtime[12];     # 0 padded, octal, null
  char checksum[8];   # 0 padded, octal, null, space
  char typeflag[1];   # file: "0"  dir: "5" 
  char linkname[100]; # ASCII + (Z unless filled)
  char magic[6];      # "ustar\0"
  char version[2];    # "00"
  char uname[32];     # ASCIIZ
  char gname[32];     # ASCIIZ
  char devmajor[8];   # 0 padded, octal, null
  char devminor[8];   # 0 padded, octal, null
  char prefix[155];   # ASCII + (Z unless filled)
};

The very description looks unDRY, and unsurprisingly the first versions of my tar reader code, written two years ago (and which crept into Archive::Tar::Minitar and RubyGems), were also very repetitious:

  # Names of the tar header fields, listed in the order in which they appear
  # in the header block.
  FIELDS = [
    :name, :mode, :uid, :gid,
    :size, :mtime, :checksum, :typeflag,
    :linkname, :magic, :version, :uname,
    :gname, :devmajor, :devminor, :prefix
  ]

  # One reader/writer pair per header field.
  attr_accessor(*FIELDS)

  # Reads one 512-byte header block from +stream+, decodes its fields and
  # builds a new entry from them.
  #
  # The numeric fields (mode, uid, gid, size, mtime, checksum, devmajor,
  # devminor) are stored as octal text in the header and are converted to
  # integers with String#oct; the remaining fields are passed on as strings.
  # The resulting hash also carries :empty, which is true when the block
  # consists solely of NUL bytes.
  #
  # stream:: any object whose +read(512)+ returns the raw header bytes
  #
  # Returns whatever +new+ returns for the decoded values.
  def self.new_from_stream(stream)
    data = stream.read(512)
    fields = data.unpack( "Z100" + # record name
                         "A8A8A8" +        # mode, uid, gid
                         "A12A12" +        # size, mtime
                         "A8a" +           # checksum, typeflag
                         "Z100" +          # linkname
                         "A6A2" +          # magic, version
                         "Z32" +           # uname
                         "Z32" +           # gname
                         "A8A8" +          # devmajor, devminor
                         "Z155"            # prefix
                        )
    # The values come out in header order; consume them one by one.
    name = fields.shift
    mode = fields.shift.oct
    uid = fields.shift.oct
    gid = fields.shift.oct
    size = fields.shift.oct
    mtime = fields.shift.oct
    checksum = fields.shift.oct
    typeflag = fields.shift
    linkname = fields.shift
    magic = fields.shift
    version = fields.shift
    uname = fields.shift
    gname = fields.shift
    devmajor = fields.shift.oct
    devminor = fields.shift.oct
    prefix = fields.shift

    empty = (data == "\0" * 512)

    # :linkname must be forwarded too; leaving it out silently dropped the
    # link target of every entry read from a stream.
    new(:name=>name, :mode=>mode, :uid=>uid, :gid=>gid, :size=>size,
        :mtime=>mtime, :checksum=>checksum, :typeflag=>typeflag,
        :linkname=>linkname, :magic=>magic, :version=>version,
        :uname=>uname, :gname=>gname, :devmajor=>devmajor,
        :devminor=>devminor, :prefix=>prefix, :empty => empty )
  end
    
  # Builds an entry from a hash of header values.
  #
  # +vals+ must hold truthy values for :name, :size, :prefix and :mode,
  # otherwise Package::ArgumentError is raised. Missing (or falsy) optional
  # values are filled in with defaults, written back into +vals+ itself.
  # Every name in FIELDS is then copied into the instance variable of the
  # same name, and @empty is taken from vals[:empty].
  def initialize(vals)
    required_present = [:name, :size, :prefix, :mode].all? { |key| vals[key] }
    raise Package::ArgumentError unless required_present

    fallbacks = [
      [:uid, 0],
      [:gid, 0],
      [:mtime, 0],
      [:checksum, ""],
      [:typeflag, "0"],
      [:magic, "ustar"],
      [:version, "00"],
      [:uname, "wheel"],
      [:gname, "wheel"],
      [:devmajor, 0],
      [:devminor, 0]
    ]
    fallbacks.each { |key, fallback| vals[key] ||= fallback }

    FIELDS.each do |field|
      instance_variable_set("@#{field}", vals[field])
    end
    @empty = vals[:empty]
  end

We can do so much better:

  • why not put the field type information in FIELDS...
  • all these fields.shift must go
  • ditto for the default values: they're awfully far from FIELDS

Ideally, the code should be driven by the data contained in FIELDS, with no redundancy. First attempt:

  # Declares an ASCII header field: returns the triple [name, :ascii, default].
  def self.A(name, default = nil)
    [name, :ascii, default]
  end
  # Declares an octal header field: returns the triple [name, :octal, default].
  def self.OCT(name, default = nil)
    [name, :octal, default]
  end

  # Header field descriptions in header order. Each entry is the
  # [name, type, default] triple produced by A (ASCII) or OCT (octal).
  FIELDS = [
    A(:name),
    OCT(:mode),
    OCT(:uid, 0),
    OCT(:gid, 0),
    OCT(:size),
    OCT(:mtime, 0),
    OCT(:checksum, ""),
    A(:typeflag, "0"),
    A(:linkname),
    A(:magic, "ustar"),
    A(:version, "00"),
    A(:uname, "wheel"),
    A(:gname, "wheel"),
    OCT(:devmajor, 0),
    OCT(:devminor, 0),
    A(:prefix)
  ]

  # One reader/writer pair per described field, named after the field.
  FIELDS.each { |description| attr_accessor description.first }

  # Reads one 512-byte header block from +stream+ and builds a new entry.
  #
  # The raw values are paired with FIELDS by position: :ascii fields are kept
  # as strings, :octal fields are converted with String#oct, and fields of any
  # other type are left out. The hash handed to +new+ additionally carries
  # :empty, which is true when the block is made up of NUL bytes only.
  def self.new_from_stream(stream)
    layout = [
      "Z100",    # record name
      "A8A8A8",  # mode, uid, gid
      "A12A12",  # size, mtime
      "A8a",     # checksum, typeflag
      "Z100",    # linkname
      "A6A2",    # magic, version
      "Z32",     # uname
      "Z32",     # gname
      "A8A8",    # devmajor, devminor
      "Z155"     # prefix
    ].join

    block = stream.read(512)
    raw_values = block.unpack(layout)

    attributes = {}
    FIELDS.each_with_index do |(field, kind, _default), position|
      raw = raw_values[position]
      if kind == :ascii
        attributes[field] = raw
      elsif kind == :octal
        attributes[field] = raw.oct
      end
    end

    attributes[:empty] = (block == "\0" * 512)
    new(attributes)
  end
    
  # Builds an entry from a hash of header values.
  #
  # vals:: hash keyed by field name; it must contain the keys :name, :size,
  #        :prefix and :mode. It may also carry :empty, which is stored in
  #        @empty. The hash itself is not modified.
  #
  # Raises Package::ArgumentError when one of the required keys is absent.
  def initialize(vals)
    # all? (not each): each returns the array itself, which is always truthy,
    # so the check could never fail.
    unless [:name, :size, :prefix, :mode].all? { |x| vals.has_key?(x) }
      raise Package::ArgumentError
    end

    # Walk FIELDS rather than vals, so that fields missing from vals still
    # receive their default and keys that are not fields (such as :empty)
    # are never looked up in FIELDS.
    FIELDS.each do |name, _, default|
      instance_variable_set "@#{name}", vals[name] || default
    end
    @empty = vals[:empty]
  end

With enough sugar, some would say that this is the basis of a data serialization DSL... I just call it redundancy encoding.

This looks much better, but the case type ... still feels wrong. Reminds me of the explicit switch statement vs. polymorphism, and the solution is very similar: let's turn that into a "polymorphic lambda call".

  # Declares an ASCII header field: returns [name, converter, default], where
  # the converter is a lambda that hands its argument back unchanged.
  def self.A(name, default = nil)
    passthrough = lambda { |raw| raw }
    [name, passthrough, default]
  end
  # Declares an octal header field: returns [name, converter, default], where
  # the converter is a lambda that calls +oct+ on its argument.
  def self.OCT(name, default = nil)
    from_octal = lambda { |raw| raw.oct }
    [name, from_octal, default]
  end

  # Header field descriptions in header order. Each entry is the
  # [name, converter, default] triple produced by A (ASCII) or OCT (octal).
  FIELDS = [
    A(:name),
    OCT(:mode),
    OCT(:uid, 0),
    OCT(:gid, 0),
    OCT(:size),
    OCT(:mtime, 0),
    OCT(:checksum, ""),
    A(:typeflag, "0"),
    A(:linkname),
    A(:magic, "ustar"),
    A(:version, "00"),
    A(:uname, "wheel"),
    A(:gname, "wheel"),
    OCT(:devmajor, 0),
    OCT(:devminor, 0),
    A(:prefix)
  ]

  # One reader/writer pair per described field, named after the field.
  FIELDS.each { |description| attr_accessor description.first }

  # Reads one 512-byte header block from +stream+ and builds a new entry.
  #
  # The raw values are paired with FIELDS by position and each one is run
  # through the converter stored in its field description. The hash handed to
  # +new+ additionally carries :empty, which is true when the block is made
  # up of NUL bytes only.
  def self.new_from_stream(stream)
    layout = [
      "Z100",    # record name
      "A8A8A8",  # mode, uid, gid
      "A12A12",  # size, mtime
      "A8a",     # checksum, typeflag
      "Z100",    # linkname
      "A6A2",    # magic, version
      "Z32",     # uname
      "Z32",     # gname
      "A8A8",    # devmajor, devminor
      "Z155"     # prefix
    ].join

    block = stream.read(512)
    raw_values = block.unpack(layout)

    attributes = {}
    FIELDS.each_with_index do |(field, converter, _default), position|
      attributes[field] = converter.call(raw_values[position])
    end

    attributes[:empty] = (block == "\0" * 512)
    new(attributes)
  end

I could keep beating this dead horse, but anyway this was mostly a one-time thing, so I'll leave it there for today.


Last modified:2006/05/01 05:53:09
Keyword(s):[blog] [ruby] [frontpage] [tar] [filename] [encoding] [utf8] [refactor]
References:[Ruby]

*1 the luser that created this mess doesn't know about encodings, so...

*2 all I do is repeat myself? :(