#!/usr/bin/perl -w ############################################ # Ian M. Hayhurst 19 December 2003
# #
# # Make a fasta file from all the 'embl like' # # Derwent .DAT files in a directory
# #
# # no doubt there are prettier ways but I've # # had enough of this one...
# # and it's almost
Christmas ;-)
# # I'm Sure I borrowed an idea from David Starks-Browning here, # # only I cant remember what
# ############################################
$usage="Usage: derwent2fasta.pl [path to Derwent DAT files]\n"; @ARGV == 1ordie"$usage"; #print "do stuff with $ARGV[0]\n"; $local_incoming_directory = "$ARGV[0]"; $db_name ="AAGENESEQ"; chdir($local_incoming_directory); opendir(DIRECTORY, $local_incoming_directory) ordie"I Can't open the $local_incoming_directory directory.\n";
@dirdatfilelist = (readdir(DIRECTORY)) ; closedir DIRECTORY; #remove '.' and '..' from head of dirlisting to create array of datfiles @datfilelist @datfilelisttmp = splice(@dirdatfilelist, 2, $#dirdatfilelist); @datfilelist = sort@datfilelisttmp; open(FASTA , ">GENESEQP.fasta")ordie("I cant open your output file"); $ID=""; $AC=""; $DE=""; $buffer=""; foreach$element(@datfilelist){
chomp; if(/^ID/){$ID=$_; $ID=~ s/^ID\s+//; $ID=~ /(\w+)/; $ID=$1;}#Just the AC No in ID if(/^AC/){$AC=$_; $AC=~ s/^AC\s+//; $AC=~ s/;//;}#no semicolon thanks if(/^DE/ ... /^XX/){ if(/^XX/){next;} $DE=$_; $DE=~ s/^DE\s+//; $buffer = $buffer . $DE; next; } if(($IDne"") && ($ACne"") && ($DEne"")){#When you've got all three print em print FASTA">$db_name:$AC $ID $buffer\n"; $ID=""; #Reset Vars for next record $AC=""; $DE=""; $buffer=""; }
if(/^SQ/ ... /^\/\//){#get the lines that dont start with SQ if(/^\/\//){next;}#between SQ and // if(! /^SQ/){$seq=$_; $seq=~ s/[^A-Za-z]//g; #concat the spaces and loose the numerals print FASTA "$seq\n";}
}
} close(ELEMENT); #unlink
("$element");
#blow away the source DAT
file if you want }