#! /usr/bin/perl -w

#This script takes a file containing the Affymetrix annotation information for a chip and a GO gene-association file (many different ones are available) and outputs a file containing an Affymetrix-identifier to GO-number lookup (as readable by clans).
#I cannot provide a script that converts all of the files, as each of the files differs slightly in format. For example, the names in the Yeast Affymetrix file have an S1 appended to them (for whatever reason), making it impossible to use that exact name for searching through the gene-ontology files. Similarly, the Zebrafish datasets from Gene Ontology and Affymetrix do not provide me with a shared name, making the mapping of the Affymetrix identifier to the GO terms challenging.
#You will therefore have to edit this script to make it work for your pet organism, or write your own.
#If you do NOT have the input-files open next to you while you are attempting to edit this script I fear you will not be able to follow the logic.

# Command-line handling: the two input files are mandatory, the output file
# is optional (the lookup table is written to STDOUT when it is omitted).
if (@ARGV < 2) {
    die "usage: perl makelookupfile.pl gene_association_file Affymetrix_annotation_file (optional: output_file; default:stdout)\n";
}

# Note the argument order: the GO gene-association file comes first and the
# Affymetrix annotation file second, although the script reads the
# Affymetrix file first.
$gofile   = $ARGV[0];
$affyfile = $ARGV[1];

# Build %nameshash from the Affymetrix annotation file:
#   key   = lower-cased gene name (content of the 9th quoted field)
#   value = reference to the list of lower-cased probe-set ids ("*_at")
#           that were found on lines carrying that gene name.
%nameshash = ();
print STDERR "reading $affyfile\n";
# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode, and $! tells the user why the open failed.
open(my $affy_fh, '<', $affyfile)
    or die "unable to open $affyfile for reading: $!\n";
#I want the *_at number at the beginning and the 9th element
#for the Yeast array I would have to change the regular expression line to sth like this
#if($inline=~/\"(\d+\S*_at)\",\".*?\",\".*?\",\".*?\",\".*?\",\".*?\",\".*?\",\".*?\",\"(.*?).S1\"/io){
#to remove the ".S1" from the gene name and be able to match it to the GO-files.
while (my $inline = <$affy_fh>) {
    # $1 = probe-set id, $2 = 9th quoted field; lines that do not have this
    # shape (e.g. a header line) are silently skipped.
    next unless $inline =~ /\"(\S+_at)\",\".*?\",\".*?\",\".*?\",\".*?\",\".*?\",\".*?\",\".*?\",\"(.*?)\"/i;

    # Convert both to lower case to avoid "case" problems when the names
    # are later compared with those from the gene-association file.
    my ($at, $name) = (lc($1), lc($2));

    # Autovivification creates the list the first time a name is seen, so
    # no separate "first occurrence" branch (and no array copy) is needed.
    push @{ $nameshash{$name} }, $at;
}
close $affy_fh;

#now I know which affy identifiers match which predicted genes. Next, look for those gene-names in the gene-ontology (gene-associations) file.

# Build %gohash from the gene-association file:
#   key   = lower-cased gene name
#   value = reference to the list of lower-cased GO numbers ("go:NNNNNNN")
#           found on lines mentioning that name, in first-seen order.
%gohash = ();
print STDERR "reading $gofile\n";
# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode, and $! tells the user why the open failed.
open(my $go_fh, '<', $gofile)
    or die "unable to open $gofile for reading: $!\n";
#here I want the third element and the GO:number and anything else that has at\dg\d+
#I also need to trim '.1' from the end of the name if present (affy specific)

# Remembers every (name, GO number) pair already stored. The same pair is
# met repeatedly when a locus id is both the third column and part of the
# rest of the line, or when a gene has several annotation lines for one GO
# term; storing it once avoids duplicate lines in the lookup file.
my %seen_pair;

while (my $inline = <$go_fh>) {
    # $1 = third whitespace-separated column, $2 = GO number directly
    # following it, $3 = remainder of the line. Other lines are skipped.
    next unless $inline =~ /^\s*\S+\s+\S+\s+(\S+)\s+(GO:\d+)(.*)/i;
    my ($checkname, $gonum, $leftover) = (lc($1), lc($2), lc($3));

    # Strip one trailing ".<digits>" version suffix (e.g. "at1g01010.1").
    $checkname =~ s/\.\d+$//;

    # The GO number applies to the third-column name and to every
    # "at<digit>g<digits>" identifier found in the remainder of the line.
    # Those identifiers cannot contain a dot, so they need no suffix trim.
    my @names = ($checkname, $leftover =~ /(at\dg\d+)/g);

    for my $name (@names) {
        next if $seen_pair{$name}{$gonum}++;
        push @{ $gohash{$name} }, $gonum;
    }
}
close $go_fh;

#now I have both the affy identifiers and the GO-numbers accessible by their (hopefully) common name. Next, print the data

# Write the lookup table: one "affy-id<TAB>go-number" line for every
# combination of probe-set id and GO number that share a gene name.
# Gene names without any GO number are reported on STDERR instead.

# Choose the destination once so that a single loop serves both cases:
# the file named by the optional third argument, or STDOUT.
my $write_to_file = @ARGV > 2;
my $out_fh;
if ($write_to_file) {
    # Three-argument open: the name is never interpreted as an open mode.
    open($out_fh, '>', $ARGV[2])
        or die "unable to write to $ARGV[2]: $!\n";
}
else {
    $out_fh = \*STDOUT;
}

# Hash order differs from run to run; sorting the keys makes the output
# reproducible without changing which lines are written.
for my $key (sort keys %nameshash) {
    my @probe_ids = @{ $nameshash{$key} };

    if (defined $gohash{$key}) {
        for my $goval (@{ $gohash{$key} }) {
            for my $probe_id (@probe_ids) {
                print {$out_fh} "$probe_id\t$goval\n";
            }
        }
    }
    else {
        print STDERR "No GO-number defined for '$key' (affy id's: ",
            join('', map { "$_;" } @probe_ids), ")\n";
    }
}

if ($write_to_file) {
    # Buffered write errors (e.g. disk full) only surface at close time.
    close $out_fh
        or die "unable to write to $ARGV[2]: $!\n";
}
