#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) DateRepeats
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      Labels the lineage specificity of repeats in RepeatMasker annotation
##      and creates a masked sequence for genomic alignment of related species
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2002-2004 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#
# ChangeLog
#
#   Updated to use FamDB instead of RepbaseEMBL/taxonomy.dat.
#
###############################################################################
#
# To Do:
#

=head1 NAME

DateRepeats - Annotate lineage of repeats.

=head1 SYNOPSIS

  DateRepeats <repeatmasker .out files> -query <species1>
              -comp <species2> [-comp <species3> -mask <species2>]

=head1 DESCRIPTION

DateRepeats will create a file similar to the .out file with added column(s)
indicating if a repeat is expected to be present in the indicated
'other species'.

Required Parameters:

=over 4

=item -q(uery) <species1>

The species that has been analyzed.

=item -c(omp) <species2>

Other mammalian species; can be used multiple times, adding extra
columns to the annotation in a  <file.out_species2_species3_etc>.

=back

Optional Parameters:

=over 4

=item -m(ask) <species>

Produces a sequence file with all lineage specific repeats masked, i.e.
those predicted to be in the -query and absent in the -mask species
(sequence <file> and RepeatMasker <file.out> must be in same directory)
<species> must correspond to one of the -comp species.

=item -a(ggressive)

Also mask those repeats unclear to be either lineage specific or ancestral.

=item -n(olow)

Does not mask satellites, simple repeats and low complexity DNA, none
of which $script is trying to label lineage-specific or ancestral.

=back

Species names are resolved via FamDB/NCBI Taxonomy.  Any species or clade
recognized by famdb.py may be used.

=head1 SEE ALSO

=over 4

RepeatMasker

=back

=head1 COPYRIGHT

Copyright 2003-2025 Arian Smit, Institute for Systems Biology

=head1 AUTHOR

Arian Smit <asmit@systemsbiology.org>

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
#use strict;
use FindBin;
use lib $FindBin::RealBin;
use RepeatMaskerConfig;
use Getopt::Long;
use Taxonomy;

# A bug in 5.8 produces way too many warnings
if ( $] && $] >= 5.008003 ) {
  use warnings;
}

my $FAMDB_DIR = $RepeatMaskerConfig::configuration->{'FAMDB_DIR'}->{'value'};

die "FamDB is required for DateRepeats but is not configured.\n"
    . "Please re-run the RepeatMasker configure script to set up FamDB.\n"
    unless ( $FAMDB_DIR ne "" && -x "$FAMDB_DIR/famdb.py" );

my ( $script ) = ( $0 =~ m|([^/]*)$| );
my $usage = "\nUsage: $script <repeatmasker .out files> -query <species1>
              -comp <species2> [-comp <species3> -mask <species2>]

$script will create a file similar to the .out file with added column(s)
indicating if a repeat is expected to be present in the indicated \'other
species\'.

Required:
-q -query <species1> The species that has been analyzed.
-c -comp <species2>  Other mammalian species; can be used multiple times,
                     adding extra columns to the annotation in a
                     <file.out_species2_species3_etc>

Optional parameters:
-m -mask <species> Produces a sequence file with all lineage specific
                   repeats masked, i.e. those predicted to be in the
                   -query and absent in the -mask species (sequence <file>
                   and RepeatMasker <file.out> must be in same directory)
                   <species> must correspond to one of the -comp species.
-a -aggressive     Also mask those repeats unclear to be lineage specific
                   or ancestral.
-n -nolow          Does not mask satellites, simple repeats and low
                   complexity DNA, none of which $script is trying to
                   label lineage-specific or ancestral.
-h -help           Print out the daterepeats help manual.

Species names are resolved via FamDB/NCBI Taxonomy.  Any species or
clade recognized by famdb.py may be used (e.g. \"homo sapiens\",
\"mus musculus\", \"rodentia\", etc.).
\n";

## currently the following are global parameters
my @spec = ();
my @opts =
    ( "query=s", "comp=s" => \@spec, "mask=s", "aggressive", "nolow", "help" );
$opt_query = $opt_mask = $opt_aggressive = $opt_nolow = $opt_help;
@spec = ();    # global; used in all subroutines

unless ( &GetOptions( @opts ) ) {
  print STDERR "$usage";
  exit( 1 );
}

# Print out the big help file if requested
my $PAGER = $ENV{PAGER};
$PAGER = "more" if !defined $PAGER;
system( "$PAGER $FindBin::Bin/daterepeats.help\n" ), exit if $opt_help;

die "$usage" unless $opt_query && $spec[ 0 ];

@pat = ();    #global
my $match;

# Open the taxonomy database (backed by FamDB)
my $tax = Taxonomy->new();

# Standardize the species names passed to us
my $query = $tax->isSpecies( $opt_query );
die "Do not know of species/clade: $opt_query!\n" if ( !defined $query || $query eq "" );
my $mask  = "";
if ( $opt_mask ) {
  $mask = $tax->isSpecies( $opt_mask );
  die "Do not know of species/clade: $opt_mask!\n" if ( !defined $mask || $mask eq "" );
}
for $i ( 0 .. $#spec ) {
  my $normalizedSpecies = $tax->isSpecies( $spec[ $i ] );
  die "Do not know of species/clade: $spec[$i]!\n"
      if ( !defined $normalizedSpecies || $normalizedSpecies eq "" );
  $spec[ $i ] = $normalizedSpecies;
  $match = $i if $mask && $spec[ $i ] eq $mask;
  die "Query and comparison species must be different\n\n"
      if ( $query eq $spec[ $i ] );
}

die "Query and -mask species must be different\n\n"
    if ( $opt_mask && $query eq $mask );

die "The -mask species has to be the same as one of the -comp species\n\n"
    if ( $opt_mask && !defined $match );

foreach $rmfile ( @ARGV ) {
  my $seqfile = $rmfile;
  $seqfile =~ s/\.out$//;
  die "$script only accepts RepeatMasker output files ending in .out "
      . "(unlike \"$rmfile\")\n\n"
      if ( $rmfile !~ /\.out$/ );
  unless ( !$opt_mask || -s $seqfile ) {
    die "No sequence file $seqfile is found in the same directory "
        . "as $rmfile to create a lineage-specifically masked "
        . "sequence file\n\n";
  }
}

&GetInfo( $tax, $query, \@spec );

foreach $rmfile ( @ARGV ) {
  my ( %begin, %end, %class, @lines );
  %pattern = ();    # global; associates an 'age-pattern' with each repeat ID
  my ( $maxID, $adjustment, $noidea ) = ( 0, 0, 0 );
  open IN, "<$rmfile" or die "Could not open $rmfile!\n";
  my $outfile = "$rmfile";
  foreach my $spec ( @spec ) {
    $outfile .= "_$spec";
  }
  $outfile =~ s/\s+/-/g;
  die "Output file is input file. $script will not override $rmfile.\n"
      if ( $outfile eq $rmfile );

  open OUT, ">$outfile" or die "Could not open $outfile for writing!\n";
  print "Processing $rmfile\n  Writing annotated output to: $outfile\n";
  my @bit         = ();
  my @correctedID = ();
  my $lineNum     = 0;
  while ( <IN> ) {
    chomp;
    if ( /^   SW  perc/ ) {
      $_ .= "     ";    # give it some space (ID is underneath)
      foreach my $spec ( @spec ) {
        $_ .= " $spec";
      }
    }
    elsif ( /\(\d+\)/ ) {
      $_ .= " ";        # in case there is no space at end
           # Get rid of overlap warnings and provide spacing for new column
      s/\s+\*?\s*$/    /;
      @bit = split;
      if ( $bit[ 14 ] ) {    # for the strangers who delete the IDs
        if (

          # Sometimes long sequences have been analyzed in chunks
          # (e.g. at UCSC analyses chunks of 500kb)
          # in those situtations the IDs start from 1 again; drop is
          # almost always > 100, so 50 should be catch most if not all
          # 50 is also high enough; never caught a repeat interrupted
          # by 50 inserted other repeats.
          # (A bug introduced in ProcessRepeats version 1.42 does
          # produce such results; these are fixed in version 1.80,
          # but similar bugs can be reintroduced; thus, an extra
          # requirement is now that the ID < 10.
          $bit[ 14 ] < 10 &&

          # All segments should start with 1 so this may be set
          # stricter. However, it is possible that, unlike at the
          # UCSC browser, people use overlapping segments and the
          # beginning of a segment may be broken of
          $bit[ 14 ] + $adjustment < $maxID - 20 &&

          # some segments may have few repeats when a very long
          # N-patch is present (e.g. centromere)
          (
               $bit[ 14 ] + $adjustment < $maxID - 50
            || $bit[ 10 ] ne $class{ $bit[ 14 ] }
          )
            )
        {

          $adjustment = ( $maxID - $bit[ 14 ] + 10 );
        }
        $bit[ 14 ] += $adjustment;
        $class{ $bit[ 14 ] } = $bit[ 10 ];
      }
      else {
        $noidea = 1;
      }
      my $patternstring;

      # Some repeat names do not appear in the repeatmasker database
      # The name changes here are invisible outside the script; they
      # only help to improve the age estimates. This code should be
      # replaced by changes in the database

      # There can't be any case-ambiguity
      $bit[ 9 ] =~ tr/a-z/A-Z/;

      # PR ambiguates some Alus (and perhaps others) by naming the
      # repeat e.g. AluSq/x AluJ/F(R)AM
      if ( $bit[ 10 ] eq 'SINE/Alu' ) {
        $bit[ 9 ] =~ s/\/\S+$//;

        # 20 CpG pairs drives up the mismatch level improperly
        $bit[ 1 ] -= 5;
        $bit[ 1 ] == 0 if $bit[ 1 ] < 0;
      }
      elsif ( $bit[ 10 ] =~ /^DNA/ ) {

        # PR throws in some alternative names like Tigger3(Golem),
        $bit[ 9 ] =~ s/\(\S+$//;
        $bit[ 9 ] =~ s/^MER7([A-Z])$/TIGGER3$1/;
        $bit[ 9 ] =~ s/^TIGGER7([A-Z])$/MER44$1/;
        $bit[ 9 ] =~ s/^TIGGER5([A-Z])$/MER47$1/;
      }
      elsif ( $bit[ 10 ] eq 'LINE/L1' ) {
        if ( $bit[ 9 ] =~ /^L1M\w$/ ) {
          $bit[ 9 ] =~ s/^L1M[E5]$/L1ME2/;
          $bit[ 9 ] =~ s/^L1M[CD]$/L1MC2/;
          $bit[ 9 ] =~ s/^L1M4$/L1MB7/;
          $bit[ 9 ] =~ s/^L1M3$/L1MB2/;
          $bit[ 9 ] =~ s/^L1M2$/L1MA7/;
          $bit[ 9 ] =~ s/^L1M1$/L1MA2/;
        }
        elsif ( $bit[ 9 ] =~ /^L1P\w?$/ ) {
          $bit[ 9 ] =~ s/^L1P1$/L1PA2/;
          $bit[ 9 ] =~ s/^L1P2$/L1PA5/;
          $bit[ 9 ] =~ s/^L1P3$/L1PA8/;
          $bit[ 9 ] =~ s/^L1P4$/L1PA14/;
          $bit[ 9 ] =~ s/^L1P[5B]$/L1PB2/;
          $bit[ 9 ] =~ s/^L1P$/L1PA10/;
        }
        $bit[ 9 ] =~ s/^L1MC\/D/L1MC3/;
      }
      elsif ( $bit[ 10 ] eq 'LTR/ERVL' ) {

# ERVL internal sequences named after MLT2 subfamilies. Primate specific one have HERVL.
        $bit[ 9 ] =~ s/^ERVL-.+/ERVL/;
        $bit[ 9 ] =~ s/^HERVL-.+/HERVL/;
      }

      # Check if shared{} exist (exists if the sequence name appears
      # in the database) If not: base phylogenetic labeling on
      # divergence level

      if ( $bit[ 10 ] =~ /^Simple|^Low|^Satel/ ) {

        # Nothing sensible to say about these
        for $i ( 0 .. $#spec ) {
          $patternstring .= "-  ";
        }
      }
      elsif ( $bit[ 14 ] && $shared{ $bit[ 9 ] } && $bit[ 10 ] !~ /RNA$/ ) {

        # RNA genes have produced copies "forever"
        $patternstring =
            &AdjustSharedInfo( $bit[ 1 ], $bit[ 14 ], $shared{ $bit[ 9 ] } );
        $pattern{ $bit[ 14 ] } = $patternstring;
      }
      else {
        for $i ( 0 .. $#spec ) {
          if ( 5 * $div{ $spec[ $i ] } / 4 < $bit[ 1 ] ) {
            $patternstring .= "X  ";
          }
          elsif ( 3 * $div{ $spec[ $i ] } / 4 > $bit[ 1 ] ) {
            $patternstring .= "0  ";
          }
          else {
            $patternstring .= "?  ";
          }
        }
      }
      $_ .= $patternstring;
      $maxID = $bit[ 14 ] if $bit[ 14 ] && $bit[ 14 ] > $maxID;
    }
    $correctedID[ $lineNum++ ] = $bit[ 14 ];
    push @lines, "$_";
  }
  close IN;
  &windback( \@lines, \@correctedID ) unless $noidea;
  foreach $_ ( @lines ) {
    print OUT "$_\n";
    if ( $opt_mask && /\(\d+\)/ ) {
      my @bits = split;
      my @pat  = @bits[ 15 .. $#bits ];
      my $pat  = $pat[ $match ];
      if (    $pat eq '0'
           || $pat eq '?' && $opt_aggressive
           || $pat eq '-' && !$opt_nolow )
      {
        push @{ $begin{ $bits[ 4 ] } },
            $bits[ 5 ];    #makes your nose shrivel, doesn't it
        push @{ $end{ $bits[ 4 ] } }, $bits[ 6 ];
      }
    }
  }
  close OUT;
  &MaskLinSpec( \%begin, \%end ) if $opt_mask;
}

##-------------------------------------------------------------------------##
sub windback {
  my $linesRef       = shift;
  my $correctedIDRef = shift;

  my $i = $#{$linesRef};
  while ( $i >= 0 ) {
    my $xandos = "";
    if ( $linesRef->[ $i ] =~ /\(\d+\)/ ) {
      my @bits = split( /\s+/, $linesRef->[ $i ] );
      shift @bits unless $bits[ 0 ] =~ /\d/;   # morose splitting of first space
      if ( $pattern{ $correctedIDRef->[ $i ] } ) {
        my $pattern = quotemeta $pattern{ $correctedIDRef->[ $i ] };
        if ( $linesRef->[ $i ] !~ /$pattern/ ) {
          my @xandos = &MakeLikeOldPattern( $bits[ 1 ],
                                            $correctedIDRef->[ $i ],
                                            @bits[ 15, 16, 17, 18, 19 ] )
              ;    # don't expect more than 5 species
          for $j ( 0 .. $#spec ) {
            $xandos .= "$xandos[$j]  ";
          }
          $linesRef->[ $i ] =~
              s/(.+$bits[13]\s+$bits[14]\s+)([\sX0\?\-]+$)/$1$xandos/;
        }
      }
      $linesRef->[ $i ] =~ /(.+$bits[13]\s+$bits[14]\s+)([\sX0\?\-]+)$/;
      $pattern{ $correctedIDRef->[ $i ] } = $2;
    }
    --$i;
  }
}

##-------------------------------------------------------------------------##
sub GetInfo {
  my $taxonomyDB     = shift;
  my $query          = shift;
  my $compSpeciesRef = shift;

  &CollectInfo( $taxonomyDB, $query, $compSpeciesRef );

  if (    $taxonomyDB->isA( $query, "homo" ) > 0
       || $taxonomyDB->isA( $query, "pan" ) > 0 )
  {
    for $i ( 0 .. $#spec ) {
      if ( $spec[ $i ] =~ /^homo[\s\S]|^pan[\s\S]/ ) {
        $div{ $spec[ $i ] } = 1;
      }
      elsif ( $spec[ $i ] =~ /^hylobates[\s\S]/ ) {
        $div{ $spec[ $i ] } = 2;
      }
      elsif ( $spec[ $i ] =~ /^macaca[\s\S]/ ) {
        $div{ $spec[ $i ] } = 3;
      }
      else {    # temporarily only for other orders
                # 15.5% is higher than theoretically expected; empirically
                # best approximate though
        $div{ $spec[ $i ] } = 15.5;
      }
    }
  }
  elsif (    $taxonomyDB->isA( $query, "mus" ) > 0
          || $taxonomyDB->isA( $query, "rattus" ) > 0 )
  {
    for $i ( 0 .. $#spec ) {
      if ( $spec[ $i ] =~ /^mus musculus$|^rattus/ ) {
        $div{ $spec[ $i ] } = 10;
      }
      else {
        $div{ $spec[ $i ] } = 28;
      }
    }
  }
  elsif (    $taxonomyDB->isA( $query, "canis" ) > 0
          || $taxonomyDB->isA( $query, "felis" ) > 0 )
  {
    for $i ( 0 .. $#spec ) {
      if ( $spec[ $i ] =~ /^felis|^canis/ ) {
        $div{ $spec[ $i ] } = 10;
      }
      else {
        $div{ $spec[ $i ] } = 18;
      }
    }
  }
  elsif (    $taxonomyDB->isA( $query, "bos" ) > 0
          || $taxonomyDB->isA( $query, "sus" ) > 0 )
  {
    for $i ( 0 .. $#spec ) {
      if ( $spec[ $i ] =~ /^bos |^sus / ) {
        $div{ $spec[ $i ] } = 12;
      }
      else {
        $div{ $spec[ $i ] } = 18;
      }
    }
  }
  elsif ( $taxonomyDB->isA( $query, "equus" ) > 0 ) {
    for $i ( 0 .. $#spec ) {
      $div{ $spec[ $i ] } = 14;    # apparently slow rate
    }
  }
  elsif ( $taxonomyDB->isA( $query, "lagomorpha" ) > 0 ) {
    for $i ( 0 .. $#spec ) {
      $div{ $spec[ $i ] } = 30;    # worse than rodents
    }
  }
}

##-------------------------------------------------------------------------##
##  CollectInfo - Build %shared hash via FamDB queries.
##
##  For each comparison species, the common ancestor with the query is
##  found.  All families annotated at or before that ancestor are
##  considered "shared" (X).  Families in the query lineage but not
##  in the shared set are "lineage-specific" (0).
##-------------------------------------------------------------------------##
sub CollectInfo {
  my $tax     = shift;
  my $query   = shift;
  my $compRef = shift;

  print "Querying FamDB for lineage information...\n";

  my @queryLineage = $tax->getLineage( $query );
  die "Could not retrieve lineage for '$query' from FamDB.\n"
      unless @queryLineage;

  # For each comp species, pre-compute the shared (ancestral) family name set
  my @ancestralSets = ();
  for my $j ( 0 .. $#{$compRef} ) {
    my $comp = $compRef->[ $j ];
    my @compLineage = $tax->getLineage( $comp );
    die "Could not retrieve lineage for '$comp' from FamDB.\n"
        unless @compLineage;

    my $commonAnc = _findCommonAncestor( \@queryLineage, \@compLineage );
    print "  Common ancestor of '$query' and '$comp': $commonAnc\n";

    my %ancestral = ();
    _collectFamilyNames( $commonAnc, [ "--ancestors" ], \%ancestral );
    printf "    %d shared (ancestral) families found.\n", scalar keys %ancestral;
    push @ancestralSets, \%ancestral;
  }

  # Collect all families in the query lineage (what RM would use to mask)
  my %queryFamilies = ();
  _collectFamilyNames( $query, [ "--ancestors", "--descendants" ], \%queryFamilies );
  printf "  %d total families found for '%s'.\n\n", scalar keys %queryFamilies, $query;

  # Build %shared: for each query family, record X/0 per comparison species
  foreach my $name ( keys %queryFamilies ) {
    my $sharedStr = "";
    for my $j ( 0 .. $#{$compRef} ) {
      $sharedStr .= ( exists $ancestralSets[ $j ]{ $name } ) ? "X  " : "0  ";
    }
    $shared{ $name } = $sharedStr;

    # Store short-name variants that RepeatMasker may use in .out files

    # _3END suffix stripped
    if ( $name =~ /_3END/ ) {
      ( my $short = $name ) =~ s/_3END.*$//;
      $shared{ $short } = $sharedStr unless $shared{ $short };
    }

    # Alu: strip trailing subfamily letter/digit
    if ( $name =~ /^ALU/ ) {
      ( my $short = $name ) =~ s/[A-Z1-9]$//;
      $shared{ $short } = $sharedStr unless $shared{ $short };
    }
    # DNA transposons: strip trailing letter for subfamily variants
    elsif ( $name =~ /^(CHARLIE|TIGGER|MER\d|MARINER|HSMAR)/ ) {
      ( my $short = $name ) =~ s/[A-Z]$//;
      $shared{ $short } = $sharedStr unless $shared{ $short };
    }
    # LTR elements: also record the -INT internal variant name
    elsif ( $name =~ /LTR|ERV|HERV|MLT|THE\d/ ) {
      my $long = $name . "-INT";
      $shared{ $long } = $sharedStr unless $shared{ $long };
    }
  }
}

##-------------------------------------------------------------------------##
##  _findCommonAncestor
##
##  Given two lineage arrays (root-first order, as returned by
##  Taxonomy::getLineage), return the name of the most derived shared node.
##-------------------------------------------------------------------------##
sub _findCommonAncestor {
  my $lineage1 = shift;
  my $lineage2 = shift;

  my $commonAnc = $lineage1->[ 0 ];    # "root" at minimum
  my $maxIdx = ( $#{ $lineage1 } < $#{ $lineage2 } )
               ? $#{ $lineage1 } : $#{ $lineage2 };
  for my $i ( 0 .. $maxIdx ) {
    last if lc( $lineage1->[ $i ] ) ne lc( $lineage2->[ $i ] );
    $commonAnc = $lineage1->[ $i ];
  }
  return $commonAnc;
}

##-------------------------------------------------------------------------##
##  _collectFamilyNames
##
##  Query FamDB for curated family names under a given clade and populate
##  a hash (keys = uppercase family names).
##
##  $flags is an arrayref of extra famdb.py flags, e.g. ["--ancestors"] or
##  ["--ancestors", "--descendants"].
##-------------------------------------------------------------------------##
sub _collectFamilyNames {
  my $clade    = shift;
  my $flags    = shift;
  my $namesRef = shift;

  my $flagStr  = join( " ", @{ $flags } );
  my $famdbCmd = "$FAMDB_DIR/famdb.py families '$clade' --curated $flagStr -f fasta_name";

  open my $fh, "$famdbCmd |"
      or die "Could not run famdb.py: $famdbCmd\n";
  while ( <$fh> ) {
    # FASTA header lines look like:  >FamilyName#Type/SubType
    if ( /^>(\S+?)(?:#|\s|$)/ ) {
      $namesRef->{ uc( $1 ) } = 1;
    }
  }
  close $fh;
  die "famdb.py returned an error for: $famdbCmd\n" if $?;
}

##-------------------------------------------------------------------------##
sub AdjustSharedInfo {
  my $div        = shift;
  my $id         = shift;
  my $oldXsandOs = shift;
  my @xs         = split( /  /, $oldXsandOs );
  my $xsandos    = "";
  for $i ( 0 .. $#spec ) {
    if (

      # labeled as ancestral but low divergence suggests lineage specificity
      $xs[ $i ] eq 'X'
      && $div{ $spec[ $i ] } / 2 >= $div
      && $div{ $spec[ $i ] } - 1 >= $div

      # only effects close species; e.g. for chimp-human (0.6%
      # div at split, but 1% taken), even perfect matches to
      # a consensus does not suggest that a shared status is wrong
      # (this depends on length of match and CpG #; perhaps can
      # be worked in?)

      # labeled as lineage-specific, but high level of
      # divergence suggests ancestral status
      || $xs[ $i ] eq '0'
      && 3 * $div{ $spec[ $i ] } / 2 <= $div
      && $div{ $spec[ $i ] } + 3 <= $div

      # imperfections in the consensus and subfamily differences
      # can easily amount to an extra 3% div
        )
    {

      # divergence levels too erratic to assign age with confidence
      # totally arbitrary difference of half and 8% divergence
      # perhaps there is a statistically valid way to do it.
      $xsandos .= "?  ";
    }
    else {
      $xsandos .= "$xs[$i]  ";
    }
  }
  if ( $pattern{$id} && $pattern{$id} ne $xsandos ) {
    $xsandos = "";
    @xs = &MakeLikeOldPattern( $div, $id, @xs );
    for $i ( 0 .. $#spec ) {
      $xsandos .= "$xs[$i]  ";
    }
  }
  return $xsandos;
}

##-------------------------------------------------------------------------##
sub MakeLikeOldPattern {
  my $div    = shift;
  my $id     = shift;
  my @xs     = @_;
  my @oldpat = split( /\s+/, $pattern{$id} );
  for $i ( 0 .. $#spec ) {
    if ( $xs[ $i ] ne $oldpat[ $i ] && $oldpat[ $i ] ne '?' ) {
      if (    $xs[ $i ] eq '?'
           || $xs[ $i ] eq '0' && $div > $div{ $spec[ $i ] }
           || $xs[ $i ] eq 'X' && $div < $div{ $spec[ $i ] } )
      {
        $xs[ $i ] = $oldpat[ $i ];
      }
    }
  }
  return @xs;
}

##-------------------------------------------------------------------------##
sub MaskLinSpec {
  my ( $beginRef, $endRef ) = @_;
  my ( $seq, $name );
  my ( %seqwithname, %description, @seqname_array ) = ();
  my $seqfile = $rmfile;
  $seqfile =~ s/\.out$//;
  open F, $seqfile;
  while ( <F> ) {
    chomp;
    if ( /^\s*>\s*(\S*)/ ) {
      if ( $seqwithname{$name} && $seqwithname{$name} ne $seq ) {
        die "No special masked file was created as there are two different
sequences with the name \"$name\" in the file\"$seqfile\"\n";
      }
      $seqwithname{$name} = $seq if $seq;
      $name               = $1;
      $description{$name} = $_;
      push( @seqname_array, $name );
      $seq = '';
    }
    else {
      $seq .= $_;
    }
  }
  if ( $seqwithname{$name} && $seqwithname{$name} ne $seq ) {
    die "No special masked file was created as there are two different
sequences with the name \"$name\" in the file\"$seqfile\"\n";
  }
  $seqwithname{$name} = $seq if $seq;
  close F;
  my $outfile = "$seqfile.masked" . "_vs_$opt_mask";
  open( MASKOUT, ">$outfile" );
  my $naam;
  foreach $naam ( @seqname_array ) {
    my $XedSeq = "";
    if ( defined $seqwithname{$naam} )
    {    # exception when a fasta line is not followed by sequence
      $XedSeq =
          &XoutSeq( $seqwithname{$naam}, $beginRef->{$naam}, $endRef->{$naam} );
    }
    print MASKOUT "$description{$naam}\n";
    my $len = length $XedSeq;
    my $i   = 0;
    while ( $i < $len - 50 ) {
      print MASKOUT substr( $XedSeq, $i, 50 ), "\n";
      $i += 50;
    }
    if ( $i < $len ) {
      print MASKOUT substr( $XedSeq, $i ), "\n";
    }
  }
}

##-------------------------------------------------------------------------##
sub XoutSeq {
  my ( $XedSeq, $beginRef, $endRef ) = @_;
  if ( $beginRef ) {
    my $i;
    for ( $i = $#$beginRef ; $i > -1 ; $i-- ) {
      my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1;
      substr( $XedSeq, $$beginRef[ $i ] - 1, $len ) = 'N' x $len;
    }
  }
  return $XedSeq;
}

1;
