Error[8]: Undefined offset: 4, File: /www/wwwroot/outofmemory.cn/tmp/plugin_ss_superseo_model_superseo.php, Line: 121
File: /www/wwwroot/outofmemory.cn/tmp/plugin_ss_superseo_model_superseo.php, Line: 473, decode(

概述在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。 A perl script which deals  with ncbi raw data,and from which get cds ,pep an

在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。

A perl script which deals  with ncbi raw data,and from which get cds,pep and gene,mRNA and protein ID List.

使用方法如下:

perl    $0    gff_filegenomes_fa_file        file_prefix


#! /usr/bin/perl -w=head1 #######################################	=head1 name	grep_cds_pep_from_ncbi_genomes_datas.pl=head1 description	deal with ncbi raw data,mRNA and protein ID List.=head1 example	perl  [+++] ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole	perl  [+++] gff_file genomes_fa_file  file_prefix=head1 author	original from Xiangfeng li,xflee0608@163.com		##2014-4-19/21##=head1 #######################################=cutuse strict;dIE `pod2text [+++]` unless @ARGV==3;my ($gff,$fa,$prefix)=@ARGV;##deal gff file##$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||dIE):(open GFF,$gff||dIE);my (%mrna,%cds,%pep);while(<GFF>){	chomp;	next if(/^#/);	my @p=split/\t/,$_;	my @q=split/;/,$p[8];	my ($rna,$pep,$nt,$gene);	my $chr=$p[0];	if($p[2] eq "mRNA"){		($rna=$q[0])=~s/ID=//;		($nt=$q[1])=~s/name=//;		($gene=$q[-3])=~s/gene=//;		$mrna{$chr}{$rna}{strand}=$p[6];		$cds{$rna}=[$gene,$nt];	}        if($p[2] eq "CDS"){                ($pep=$q[1])=~s/name=//;                ($rna=$q[2])=~s/Parent=//;                push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];                $pep{$rna}=$pep;        }}close GFF;##get ID List##my %anno;my $ID_gene_cds_pep=$prefix."_ID_gene_cds_pep.lst";open ID,">",$ID_gene_cds_pep||dIE;foreach my $i(sort keys %pep){	if($cds{$i}){		my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};		print ID $out,"\n";		($anno{$i}=$out)=~s/\t/\|/g;	}}warn "create file:$ID_gene_cds_pep\n";close ID;##deal fa file##my %max;$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||dIE):(open FA,$fa||dIE);my $raw_cds=$prefix."_raw_cds";open CDS1,">$raw_cds"||dIE;my ($start,$end);$/=">";<FA>;$/="\n";while(<FA>){        my $name= if(/(\S+)/);	my $info=(split/\|/,$name)[-1];        $/=">";        my $seq=<FA>;        $/="\n";        $seq=~s/>|\n+//g;	my $scaf=$mrna{$info};	foreach my $k(sort keys %$scaf){		next if(! exists $scaf->{$k}{nt});		my @p=@{$scaf->{$k}{nt}};		my $strand=$$scaf{$k}{strand};		my $get;		@p=sort{$a->[0]<=>$b->[0]}@p;		my $loc1=$p[0][0];		my $loc2=$p[-1][1];		my ($get_len,$add,$out,$gene);		if(exists $anno{$k}){			$add=$anno{$k}; 			$gene=(split/\|/,$add)[1];		}else{next;}		foreach(@p){			($start,$end)=@$_[0,1];			$get.=uc(substr($seq,$start-1,$end-$start+1));		}		if($strand eq "+"){			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";			push @{$max{$gene}},[$get_len,$out];			print CDS1 $out;		}		if($strand eq "-"){			$get=&reverse_complement($get);			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";			push @{$max{$gene}},$out];			print CDS1 $out;		}	}}warn "create file:$raw_cds\n";close FA;close CDS1;##get max transcript##my $filter_cds=$prefix."_filter_cds";open CDS2,">$filter_cds"||dIE;my @a;foreach my $j(keys %max){	my @trans=@{$max{$j}};	@trans=sort{$a->[0] cmp $b->[0]}@trans;	push @a,$trans[-1][1];}my @a_new;foreach(@a){	my $r= if(/^>rna(\d+)/);	push @a_new,[$r,$_];}my @cds_sort=map{$_->[1]}		sort{$a->[0] <=> $b->[0]}@a_new;print CDS2 $_ for@cds_sort;close CDS2;warn "create file:$filter_cds\n";##get raw pep sequences##my $raw_pep=$prefix."_raw_pep";open PEP1,$raw_pep||dIE;my @raw_pep=&cds2pep($raw_cds);print PEP1 $_ for@raw_pep;close PEP1;warn "create file:$raw_pep\n";##get filter pep sequences##my $filter_pep=$prefix."_filter_pep";open PEP2,">$filter_pep"||dIE;my @filter_pep=&cds2pep($filter_cds);print PEP2 $_ for@filter_pep;close PEP2;warn "create file:$filter_pep\n";##add label for cds and pep of filter##my $label=uc($prefix);open IN1,$filter_cds||dIE;my $filter_cds_label=$prefix."_filter_cds_label";open OUT1,$filter_cds_label||dIE;while(<IN1>){	chomp;	if(/^>/){		my @a=split/\|/,$_,2;		my $name=$a[0]."_$label";		print OUT1"$name |$a[1]\n";	}else{print OUT1 "$_\n";}}close IN1;close OUT1;warn "create file:$filter_cds_label\n";open IN2,$filter_pep||dIE;my $filter_pep_label=$prefix."_filter_pep_label";open OUT2,$filter_pep_label||dIE;while(<IN2>){        chomp;        if(/^>/){                my @a=split/\|/,2;                my $name=$a[0]."_$label";                print OUT2 "$name |$a[1]\n";        }else{print OUT2 "$_\n";}}close IN2;close OUT2;warn "create file:$filter_pep_label\n";##timing##my $time=times;my $time_out=sprintf "%.2f",$time/60;print "##########\nElapsed Time :$time_out mins\n##########\n";##subroutine##sub reverse_complement{	my ($seq)=shift;	$seq=reverse$seq;	$seq=~tr/AaGgCcTt/TtCcGgAa/;	return $seq;}##subroutine##sub cds2pep{	my $file=shift;	my %code = (                        "standard" =>                                {                                'GCA' => 'A','GCC' => 'A','GCG' => 'A','GCT' => 'A',# Alanine                                'TGC' => 'C','TGT' => 'C',# Cysteine                                'GAC' => 'D','GAT' => 'D',# Aspartic Aci                                'GAA' => 'E','GAG' => 'E',# glutamic Aci                                'TTC' => 'F','TTT' => 'F',# Phenylalanin                                'GGA' => 'G','GGC' => 'G','GGG' => 'G','GGT' => 'G',# Glycine                                'CAC' => 'H','CAT' => 'H',# HistIDine                                'ATA' => 'I','ATC' => 'I','ATT' => 'I',# Isoleucine                                'AAA' => 'K','AAG' => 'K',# Lysine                                'CTA' => 'L','CTC' => 'L','CTG' => 'L','CTT' => 'L','TTA' => 'L','TTG' => 'L',# Leucine                                'ATG' => 'M',# Methionine                                'AAC' => 'N','AAT' => 'N',# Asparagine                                'CCA' => 'P','CCC' => 'P','CCG' => 'P','CCT' => 'P',# Proline                                'CAA' => 'Q','CAG' => 'Q',# glutamine                                'CGA' => 'R','CGC' => 'R','CGG' => 'R','CGT' => 'R','AGA' => 'R','AGG' => 'R',# Arginine                                'TCA' => 'S','TCC' => 'S','TCG' => 'S','TCT' => 'S','AGC' => 'S','AGT' => 'S',# Serine                                'ACA' => 'T','ACC' => 'T','ACG' => 'T','ACT' => 'T',# Threonine                                'GTA' => 'V','GTC' => 'V','GTG' => 'V','GTT' => 'V',# Valine                                'TGG' => 'W',# Tryptophan                                'TAC' => 'Y','TAT' => 'Y',# Tyrosine                                'TAA' => 'U','TAG' => 'U','TGA' => 'U'                                              # Stop                                }                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future        );	open IN,$file||dIE;	$/=">";<IN>;$/="\n";	my @results_set;	while(<IN>){		my $info=$_;        	my @a=split/\s+/,$info;        	$/=">";my $seq=<IN>;$/="\n";        	$seq=~s/\n|>//g;        	my $len=length$seq;        	my $info_out=join " ",@a[0..($#a-1)];        	my ($pep_out,$triplet);        	for(my $i=0;$i<$len;$i+=3){                	$triplet=substr($seq,3);                	next if(length$triplet!=3);                	if(exists $code{standard}{$triplet}){                        	$pep_out.=$code{standard}{$triplet};                	}else{$pep_out.="X"}        	}        	$pep_out=~s/U$// if($pep_out=~/U$/);        	my $pep_len=length$pep_out;        	$pep_out=~s/([A-Z]{50})/\n/g;        	chop($pep_out) unless($pep_len%50);        	my $results= ">$info_out length=$pep_len\n$pep_out\n";		push @results_set,$results;	}	return @results_set;}__END__
总结

以上是内存溢出为你收集整理的从NCBI基因组数据中获得cds,pep和geneID对应表全部内容,希望文章能够帮你解决从NCBI基因组数据中获得cds,pep和geneID对应表所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

)
File: /www/wwwroot/outofmemory.cn/tmp/route_read.php, Line: 126, InsideLink()
File: /www/wwwroot/outofmemory.cn/tmp/index.inc.php, Line: 166, include(/www/wwwroot/outofmemory.cn/tmp/route_read.php)
File: /www/wwwroot/outofmemory.cn/index.php, Line: 30, include(/www/wwwroot/outofmemory.cn/tmp/index.inc.php)
Error[8]: Undefined offset: 5, File: /www/wwwroot/outofmemory.cn/tmp/plugin_ss_superseo_model_superseo.php, Line: 121
File: /www/wwwroot/outofmemory.cn/tmp/plugin_ss_superseo_model_superseo.php, Line: 473, decode(

概述在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。 A perl script which deals  with ncbi raw data,and from which get cds ,pep an

在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。

A perl script which deals  with ncbi raw data,and from which get cds,pep and gene,mRNA and protein ID List.

使用方法如下:

perl    $0    gff_filegenomes_fa_file        file_prefix


#! /usr/bin/perl -w=head1 #######################################	=head1 name	grep_cds_pep_from_ncbi_genomes_datas.pl=head1 description	deal with ncbi raw data,mRNA and protein ID List.=head1 example	perl   ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole	perl  [+++] gff_file genomes_fa_file  file_prefix=head1 author	original from Xiangfeng li,xflee0608@163.com		##2014-4-19/21##=head1 #######################################=cutuse strict;dIE `pod2text [+++]` unless @ARGV==3;my ($gff,$fa,$prefix)=@ARGV;##deal gff file##$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||dIE):(open GFF,$gff||dIE);my (%mrna,%cds,%pep);while(<GFF>){	chomp;	next if(/^#/);	my @p=split/\t/,$_;	my @q=split/;/,$p[8];	my ($rna,$pep,$nt,$gene);	my $chr=$p[0];	if($p[2] eq "mRNA"){		($rna=$q[0])=~s/ID=//;		($nt=$q[1])=~s/name=//;		($gene=$q[-3])=~s/gene=//;		$mrna{$chr}{$rna}{strand}=$p[6];		$cds{$rna}=[$gene,$nt];	}        if($p[2] eq "CDS"){                ($pep=$q[1])=~s/name=//;                ($rna=$q[2])=~s/Parent=//;                push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];                $pep{$rna}=$pep;        }}close GFF;##get ID List##my %anno;my $ID_gene_cds_pep=$prefix."_ID_gene_cds_pep.lst";open ID,">",$ID_gene_cds_pep||dIE;foreach my $i(sort keys %pep){	if($cds{$i}){		my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};		print ID $out,"\n";		($anno{$i}=$out)=~s/\t/\|/g;	}}warn "create file:$ID_gene_cds_pep\n";close ID;##deal fa file##my %max;$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||dIE):(open FA,$fa||dIE);my $raw_cds=$prefix."_raw_cds";open CDS1,">$raw_cds"||dIE;my ($start,$end);$/=">";<FA>;$/="\n";while(<FA>){        my $name= if(/(\S+)/);	my $info=(split/\|/,$name)[-1];        $/=">";        my $seq=<FA>;        $/="\n";        $seq=~s/>|\n+//g;	my $scaf=$mrna{$info};	foreach my $k(sort keys %$scaf){		next if(! exists $scaf->{$k}{nt});		my @p=@{$scaf->{$k}{nt}};		my $strand=$$scaf{$k}{strand};		my $get;		@p=sort{$a->[0]<=>$b->[0]}@p;		my $loc1=$p[0][0];		my $loc2=$p[-1][1];		my ($get_len,$add,$out,$gene);		if(exists $anno{$k}){			$add=$anno{$k}; 			$gene=(split/\|/,$add)[1];		}else{next;}		foreach(@p){			($start,$end)=@$_[0,1];			$get.=uc(substr($seq,$start-1,$end-$start+1));		}		if($strand eq "+"){			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";			push @{$max{$gene}},[$get_len,$out];			print CDS1 $out;		}		if($strand eq "-"){			$get=&reverse_complement($get);			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";			push @{$max{$gene}},$out];			print CDS1 $out;		}	}}warn "create file:$raw_cds\n";close FA;close CDS1;##get max transcript##my $filter_cds=$prefix."_filter_cds";open CDS2,">$filter_cds"||dIE;my @a;foreach my $j(keys %max){	my @trans=@{$max{$j}};	@trans=sort{$a->[0] cmp $b->[0]}@trans;	push @a,$trans[-1][1];}my @a_new;foreach(@a){	my $r= if(/^>rna(\d+)/);	push @a_new,[$r,$_];}my @cds_sort=map{$_->[1]}		sort{$a->[0] <=> $b->[0]}@a_new;print CDS2 $_ for@cds_sort;close CDS2;warn "create file:$filter_cds\n";##get raw pep sequences##my $raw_pep=$prefix."_raw_pep";open PEP1,$raw_pep||dIE;my @raw_pep=&cds2pep($raw_cds);print PEP1 $_ for@raw_pep;close PEP1;warn "create file:$raw_pep\n";##get filter pep sequences##my $filter_pep=$prefix."_filter_pep";open PEP2,">$filter_pep"||dIE;my @filter_pep=&cds2pep($filter_cds);print PEP2 $_ for@filter_pep;close PEP2;warn "create file:$filter_pep\n";##add label for cds and pep of filter##my $label=uc($prefix);open IN1,$filter_cds||dIE;my $filter_cds_label=$prefix."_filter_cds_label";open OUT1,$filter_cds_label||dIE;while(<IN1>){	chomp;	if(/^>/){		my @a=split/\|/,$_,2;		my $name=$a[0]."_$label";		print OUT1"$name |$a[1]\n";	}else{print OUT1 "$_\n";}}close IN1;close OUT1;warn "create file:$filter_cds_label\n";open IN2,$filter_pep||dIE;my $filter_pep_label=$prefix."_filter_pep_label";open OUT2,$filter_pep_label||dIE;while(<IN2>){        chomp;        if(/^>/){                my @a=split/\|/,2;                my $name=$a[0]."_$label";                print OUT2 "$name |$a[1]\n";        }else{print OUT2 "$_\n";}}close IN2;close OUT2;warn "create file:$filter_pep_label\n";##timing##my $time=times;my $time_out=sprintf "%.2f",$time/60;print "##########\nElapsed Time :$time_out mins\n##########\n";##subroutine##sub reverse_complement{	my ($seq)=shift;	$seq=reverse$seq;	$seq=~tr/AaGgCcTt/TtCcGgAa/;	return $seq;}##subroutine##sub cds2pep{	my $file=shift;	my %code = (                        "standard" =>                                {                                'GCA' => 'A','GCC' => 'A','GCG' => 'A','GCT' => 'A',# Alanine                                'TGC' => 'C','TGT' => 'C',# Cysteine                                'GAC' => 'D','GAT' => 'D',# Aspartic Aci                                'GAA' => 'E','GAG' => 'E',# glutamic Aci                                'TTC' => 'F','TTT' => 'F',# Phenylalanin                                'GGA' => 'G','GGC' => 'G','GGG' => 'G','GGT' => 'G',# Glycine                                'CAC' => 'H','CAT' => 'H',# HistIDine                                'ATA' => 'I','ATC' => 'I','ATT' => 'I',# Isoleucine                                'AAA' => 'K','AAG' => 'K',# Lysine                                'CTA' => 'L','CTC' => 'L','CTG' => 'L','CTT' => 'L','TTA' => 'L','TTG' => 'L',# Leucine                                'ATG' => 'M',# Methionine                                'AAC' => 'N','AAT' => 'N',# Asparagine                                'CCA' => 'P','CCC' => 'P','CCG' => 'P','CCT' => 'P',# Proline                                'CAA' => 'Q','CAG' => 'Q',# glutamine                                'CGA' => 'R','CGC' => 'R','CGG' => 'R','CGT' => 'R','AGA' => 'R','AGG' => 'R',# Arginine                                'TCA' => 'S','TCC' => 'S','TCG' => 'S','TCT' => 'S','AGC' => 'S','AGT' => 'S',# Serine                                'ACA' => 'T','ACC' => 'T','ACG' => 'T','ACT' => 'T',# Threonine                                'GTA' => 'V','GTC' => 'V','GTG' => 'V','GTT' => 'V',# Valine                                'TGG' => 'W',# Tryptophan                                'TAC' => 'Y','TAT' => 'Y',# Tyrosine                                'TAA' => 'U','TAG' => 'U','TGA' => 'U'                                              # Stop                                }                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future        );	open IN,$file||dIE;	$/=">";<IN>;$/="\n";	my @results_set;	while(<IN>){		my $info=$_;        	my @a=split/\s+/,$info;        	$/=">";my $seq=<IN>;$/="\n";        	$seq=~s/\n|>//g;        	my $len=length$seq;        	my $info_out=join " ",@a[0..($#a-1)];        	my ($pep_out,$triplet);        	for(my $i=0;$i<$len;$i+=3){                	$triplet=substr($seq,3);                	next if(length$triplet!=3);                	if(exists $code{standard}{$triplet}){                        	$pep_out.=$code{standard}{$triplet};                	}else{$pep_out.="X"}        	}        	$pep_out=~s/U$// if($pep_out=~/U$/);        	my $pep_len=length$pep_out;        	$pep_out=~s/([A-Z]{50})/\n/g;        	chop($pep_out) unless($pep_len%50);        	my $results= ">$info_out length=$pep_len\n$pep_out\n";		push @results_set,$results;	}	return @results_set;}__END__
总结

以上是内存溢出为你收集整理的从NCBI基因组数据中获得cds,pep和geneID对应表全部内容,希望文章能够帮你解决从NCBI基因组数据中获得cds,pep和geneID对应表所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

)
File: /www/wwwroot/outofmemory.cn/tmp/route_read.php, Line: 126, InsideLink()
File: /www/wwwroot/outofmemory.cn/tmp/index.inc.php, Line: 166, include(/www/wwwroot/outofmemory.cn/tmp/route_read.php)
File: /www/wwwroot/outofmemory.cn/index.php, Line: 30, include(/www/wwwroot/outofmemory.cn/tmp/index.inc.php)
Error[8]: Undefined offset: 6, File: /www/wwwroot/outofmemory.cn/tmp/plugin_ss_superseo_model_superseo.php, Line: 121
File: /www/wwwroot/outofmemory.cn/tmp/plugin_ss_superseo_model_superseo.php, Line: 473, decode(

概述在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。 A perl script which deals  with ncbi raw data,and from which get cds ,pep an

在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。

A perl script which deals  with ncbi raw data,and from which get cds,pep and gene,mRNA and protein ID List.

使用方法如下:

perl    $0    gff_filegenomes_fa_file        file_prefix


#! /usr/bin/perl -w=head1 #######################################	=head1 name	grep_cds_pep_from_ncbi_genomes_datas.pl=head1 description	deal with ncbi raw data,mRNA and protein ID List.=head1 example	perl   ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole	perl   gff_file genomes_fa_file  file_prefix=head1 author	original from Xiangfeng li,xflee0608@163.com		##2014-4-19/21##=head1 #######################################=cutuse strict;dIE `pod2text [+++]` unless @ARGV==3;my ($gff,$fa,$prefix)=@ARGV;##deal gff file##$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||dIE):(open GFF,$gff||dIE);my (%mrna,%cds,%pep);while(<GFF>){	chomp;	next if(/^#/);	my @p=split/\t/,$_;	my @q=split/;/,$p[8];	my ($rna,$pep,$nt,$gene);	my $chr=$p[0];	if($p[2] eq "mRNA"){		($rna=$q[0])=~s/ID=//;		($nt=$q[1])=~s/name=//;		($gene=$q[-3])=~s/gene=//;		$mrna{$chr}{$rna}{strand}=$p[6];		$cds{$rna}=[$gene,$nt];	}        if($p[2] eq "CDS"){                ($pep=$q[1])=~s/name=//;                ($rna=$q[2])=~s/Parent=//;                push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];                $pep{$rna}=$pep;        }}close GFF;##get ID List##my %anno;my $ID_gene_cds_pep=$prefix."_ID_gene_cds_pep.lst";open ID,">",$ID_gene_cds_pep||dIE;foreach my $i(sort keys %pep){	if($cds{$i}){		my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};		print ID $out,"\n";		($anno{$i}=$out)=~s/\t/\|/g;	}}warn "create file:$ID_gene_cds_pep\n";close ID;##deal fa file##my %max;$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||dIE):(open FA,$fa||dIE);my $raw_cds=$prefix."_raw_cds";open CDS1,">$raw_cds"||dIE;my ($start,$end);$/=">";<FA>;$/="\n";while(<FA>){        my $name= if(/(\S+)/);	my $info=(split/\|/,$name)[-1];        $/=">";        my $seq=<FA>;        $/="\n";        $seq=~s/>|\n+//g;	my $scaf=$mrna{$info};	foreach my $k(sort keys %$scaf){		next if(! exists $scaf->{$k}{nt});		my @p=@{$scaf->{$k}{nt}};		my $strand=$$scaf{$k}{strand};		my $get;		@p=sort{$a->[0]<=>$b->[0]}@p;		my $loc1=$p[0][0];		my $loc2=$p[-1][1];		my ($get_len,$add,$out,$gene);		if(exists $anno{$k}){			$add=$anno{$k}; 			$gene=(split/\|/,$add)[1];		}else{next;}		foreach(@p){			($start,$end)=@$_[0,1];			$get.=uc(substr($seq,$start-1,$end-$start+1));		}		if($strand eq "+"){			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";			push @{$max{$gene}},[$get_len,$out];			print CDS1 $out;		}		if($strand eq "-"){			$get=&reverse_complement($get);			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";			push @{$max{$gene}},$out];			print CDS1 $out;		}	}}warn "create file:$raw_cds\n";close FA;close CDS1;##get max transcript##my $filter_cds=$prefix."_filter_cds";open CDS2,">$filter_cds"||dIE;my @a;foreach my $j(keys %max){	my @trans=@{$max{$j}};	@trans=sort{$a->[0] cmp $b->[0]}@trans;	push @a,$trans[-1][1];}my @a_new;foreach(@a){	my $r= if(/^>rna(\d+)/);	push @a_new,[$r,$_];}my @cds_sort=map{$_->[1]}		sort{$a->[0] <=> $b->[0]}@a_new;print CDS2 $_ for@cds_sort;close CDS2;warn "create file:$filter_cds\n";##get raw pep sequences##my $raw_pep=$prefix."_raw_pep";open PEP1,$raw_pep||dIE;my @raw_pep=&cds2pep($raw_cds);print PEP1 $_ for@raw_pep;close PEP1;warn "create file:$raw_pep\n";##get filter pep sequences##my $filter_pep=$prefix."_filter_pep";open PEP2,">$filter_pep"||dIE;my @filter_pep=&cds2pep($filter_cds);print PEP2 $_ for@filter_pep;close PEP2;warn "create file:$filter_pep\n";##add label for cds and pep of filter##my $label=uc($prefix);open IN1,$filter_cds||dIE;my $filter_cds_label=$prefix."_filter_cds_label";open OUT1,$filter_cds_label||dIE;while(<IN1>){	chomp;	if(/^>/){		my @a=split/\|/,$_,2;		my $name=$a[0]."_$label";		print OUT1"$name |$a[1]\n";	}else{print OUT1 "$_\n";}}close IN1;close OUT1;warn "create file:$filter_cds_label\n";open IN2,$filter_pep||dIE;my $filter_pep_label=$prefix."_filter_pep_label";open OUT2,$filter_pep_label||dIE;while(<IN2>){        chomp;        if(/^>/){                my @a=split/\|/,2;                my $name=$a[0]."_$label";                print OUT2 "$name |$a[1]\n";        }else{print OUT2 "$_\n";}}close IN2;close OUT2;warn "create file:$filter_pep_label\n";##timing##my $time=times;my $time_out=sprintf "%.2f",$time/60;print "##########\nElapsed Time :$time_out mins\n##########\n";##subroutine##sub reverse_complement{	my ($seq)=shift;	$seq=reverse$seq;	$seq=~tr/AaGgCcTt/TtCcGgAa/;	return $seq;}##subroutine##sub cds2pep{	my $file=shift;	my %code = (                        "standard" =>                                {                                'GCA' => 'A','GCC' => 'A','GCG' => 'A','GCT' => 'A',# Alanine                                'TGC' => 'C','TGT' => 'C',# Cysteine                                'GAC' => 'D','GAT' => 'D',# Aspartic Aci                                'GAA' => 'E','GAG' => 'E',# glutamic Aci                                'TTC' => 'F','TTT' => 'F',# Phenylalanin                                'GGA' => 'G','GGC' => 'G','GGG' => 'G','GGT' => 'G',# Glycine                                'CAC' => 'H','CAT' => 'H',# HistIDine                                'ATA' => 'I','ATC' => 'I','ATT' => 'I',# Isoleucine                                'AAA' => 'K','AAG' => 'K',# Lysine                                'CTA' => 'L','CTC' => 'L','CTG' => 'L','CTT' => 'L','TTA' => 'L','TTG' => 'L',# Leucine                                'ATG' => 'M',# Methionine                                'AAC' => 'N','AAT' => 'N',# Asparagine                                'CCA' => 'P','CCC' => 'P','CCG' => 'P','CCT' => 'P',# Proline                                'CAA' => 'Q','CAG' => 'Q',# glutamine                                'CGA' => 'R','CGC' => 'R','CGG' => 'R','CGT' => 'R','AGA' => 'R','AGG' => 'R',# Arginine                                'TCA' => 'S','TCC' => 'S','TCG' => 'S','TCT' => 'S','AGC' => 'S','AGT' => 'S',# Serine                                'ACA' => 'T','ACC' => 'T','ACG' => 'T','ACT' => 'T',# Threonine                                'GTA' => 'V','GTC' => 'V','GTG' => 'V','GTT' => 'V',# Valine                                'TGG' => 'W',# Tryptophan                                'TAC' => 'Y','TAT' => 'Y',# Tyrosine                                'TAA' => 'U','TAG' => 'U','TGA' => 'U'                                              # Stop                                }                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future        );	open IN,$file||dIE;	$/=">";<IN>;$/="\n";	my @results_set;	while(<IN>){		my $info=$_;        	my @a=split/\s+/,$info;        	$/=">";my $seq=<IN>;$/="\n";        	$seq=~s/\n|>//g;        	my $len=length$seq;        	my $info_out=join " ",@a[0..($#a-1)];        	my ($pep_out,$triplet);        	for(my $i=0;$i<$len;$i+=3){                	$triplet=substr($seq,3);                	next if(length$triplet!=3);                	if(exists $code{standard}{$triplet}){                        	$pep_out.=$code{standard}{$triplet};                	}else{$pep_out.="X"}        	}        	$pep_out=~s/U$// if($pep_out=~/U$/);        	my $pep_len=length$pep_out;        	$pep_out=~s/([A-Z]{50})/\n/g;        	chop($pep_out) unless($pep_len%50);        	my $results= ">$info_out length=$pep_len\n$pep_out\n";		push @results_set,$results;	}	return @results_set;}__END__
总结

以上是内存溢出为你收集整理的从NCBI基因组数据中获得cds,pep和geneID对应表全部内容,希望文章能够帮你解决从NCBI基因组数据中获得cds,pep和geneID对应表所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

)
File: /www/wwwroot/outofmemory.cn/tmp/route_read.php, Line: 126, InsideLink()
File: /www/wwwroot/outofmemory.cn/tmp/index.inc.php, Line: 166, include(/www/wwwroot/outofmemory.cn/tmp/route_read.php)
File: /www/wwwroot/outofmemory.cn/index.php, Line: 30, include(/www/wwwroot/outofmemory.cn/tmp/index.inc.php)
从NCBI基因组数据中获得cds,pep和geneID对应表_语言综合_内存溢出

从NCBI基因组数据中获得cds,pep和geneID对应表

从NCBI基因组数据中获得cds,pep和geneID对应表,第1张

概述在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。 A perl script which deals  with ncbi raw data,and from which get cds ,pep an

在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。

A perl script which deals  with ncbi raw data,and from which get cds,pep and gene,mRNA and protein ID List.

使用方法如下:

perl    $0    gff_filegenomes_fa_file        file_prefix


#! /usr/bin/perl -w=head1 #######################################	=head1 name	grep_cds_pep_from_ncbi_genomes_datas.pl=head1 description	deal with ncbi raw data,mRNA and protein ID List.=head1 example	perl   ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole	perl   gff_file genomes_fa_file  file_prefix=head1 author	original from Xiangfeng li,xflee0608@163.com		##2014-4-19/21##=head1 #######################################=cutuse strict;dIE `pod2text ` unless @ARGV==3;my ($gff,$fa,$prefix)=@ARGV;##deal gff file##$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||dIE):(open GFF,$gff||dIE);my (%mrna,%cds,%pep);while(<GFF>){	chomp;	next if(/^#/);	my @p=split/\t/,$_;	my @q=split/;/,$p[8];	my ($rna,$pep,$nt,$gene);	my $chr=$p[0];	if($p[2] eq "mRNA"){		($rna=$q[0])=~s/ID=//;		($nt=$q[1])=~s/name=//;		($gene=$q[-3])=~s/gene=//;		$mrna{$chr}{$rna}{strand}=$p[6];		$cds{$rna}=[$gene,$nt];	}        if($p[2] eq "CDS"){                ($pep=$q[1])=~s/name=//;                ($rna=$q[2])=~s/Parent=//;                push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];                $pep{$rna}=$pep;        }}close GFF;##get ID List##my %anno;my $ID_gene_cds_pep=$prefix."_ID_gene_cds_pep.lst";open ID,">",$ID_gene_cds_pep||dIE;foreach my $i(sort keys %pep){	if($cds{$i}){		my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};		print ID $out,"\n";		($anno{$i}=$out)=~s/\t/\|/g;	}}warn "create file:$ID_gene_cds_pep\n";close ID;##deal fa file##my %max;$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||dIE):(open FA,$fa||dIE);my $raw_cds=$prefix."_raw_cds";open CDS1,">$raw_cds"||dIE;my ($start,$end);$/=">";<FA>;$/="\n";while(<FA>){        my $name= if(/(\S+)/);	my $info=(split/\|/,$name)[-1];        $/=">";        my $seq=<FA>;        $/="\n";        $seq=~s/>|\n+//g;	my $scaf=$mrna{$info};	foreach my $k(sort keys %$scaf){		next if(! exists $scaf->{$k}{nt});		my @p=@{$scaf->{$k}{nt}};		my $strand=$$scaf{$k}{strand};		my $get;		@p=sort{$a->[0]<=>$b->[0]}@p;		my $loc1=$p[0][0];		my $loc2=$p[-1][1];		my ($get_len,$add,$out,$gene);		if(exists $anno{$k}){			$add=$anno{$k}; 			$gene=(split/\|/,$add)[1];		}else{next;}		foreach(@p){			($start,$end)=@$_[0,1];			$get.=uc(substr($seq,$start-1,$end-$start+1));		}		if($strand eq "+"){			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";			push @{$max{$gene}},[$get_len,$out];			print CDS1 $out;		}		if($strand eq "-"){			$get=&reverse_complement($get);			$get_len=length$get;			$get=~s/([A-Z]{50})/\n/g;			chop($get) unless($get_len%50);			$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";			push @{$max{$gene}},$out];			print CDS1 $out;		}	}}warn "create file:$raw_cds\n";close FA;close CDS1;##get max transcript##my $filter_cds=$prefix."_filter_cds";open CDS2,">$filter_cds"||dIE;my @a;foreach my $j(keys %max){	my @trans=@{$max{$j}};	@trans=sort{$a->[0] cmp $b->[0]}@trans;	push @a,$trans[-1][1];}my @a_new;foreach(@a){	my $r= if(/^>rna(\d+)/);	push @a_new,[$r,$_];}my @cds_sort=map{$_->[1]}		sort{$a->[0] <=> $b->[0]}@a_new;print CDS2 $_ for@cds_sort;close CDS2;warn "create file:$filter_cds\n";##get raw pep sequences##my $raw_pep=$prefix."_raw_pep";open PEP1,$raw_pep||dIE;my @raw_pep=&cds2pep($raw_cds);print PEP1 $_ for@raw_pep;close PEP1;warn "create file:$raw_pep\n";##get filter pep sequences##my $filter_pep=$prefix."_filter_pep";open PEP2,">$filter_pep"||dIE;my @filter_pep=&cds2pep($filter_cds);print PEP2 $_ for@filter_pep;close PEP2;warn "create file:$filter_pep\n";##add label for cds and pep of filter##my $label=uc($prefix);open IN1,$filter_cds||dIE;my $filter_cds_label=$prefix."_filter_cds_label";open OUT1,$filter_cds_label||dIE;while(<IN1>){	chomp;	if(/^>/){		my @a=split/\|/,$_,2;		my $name=$a[0]."_$label";		print OUT1"$name |$a[1]\n";	}else{print OUT1 "$_\n";}}close IN1;close OUT1;warn "create file:$filter_cds_label\n";open IN2,$filter_pep||dIE;my $filter_pep_label=$prefix."_filter_pep_label";open OUT2,$filter_pep_label||dIE;while(<IN2>){        chomp;        if(/^>/){                my @a=split/\|/,2;                my $name=$a[0]."_$label";                print OUT2 "$name |$a[1]\n";        }else{print OUT2 "$_\n";}}close IN2;close OUT2;warn "create file:$filter_pep_label\n";##timing##my $time=times;my $time_out=sprintf "%.2f",$time/60;print "##########\nElapsed Time :$time_out mins\n##########\n";##subroutine##sub reverse_complement{	my ($seq)=shift;	$seq=reverse$seq;	$seq=~tr/AaGgCcTt/TtCcGgAa/;	return $seq;}##subroutine##sub cds2pep{	my $file=shift;	my %code = (                        "standard" =>                                {                                'GCA' => 'A','GCC' => 'A','GCG' => 'A','GCT' => 'A',# Alanine                                'TGC' => 'C','TGT' => 'C',# Cysteine                                'GAC' => 'D','GAT' => 'D',# Aspartic Aci                                'GAA' => 'E','GAG' => 'E',# glutamic Aci                                'TTC' => 'F','TTT' => 'F',# Phenylalanin                                'GGA' => 'G','GGC' => 'G','GGG' => 'G','GGT' => 'G',# Glycine                                'CAC' => 'H','CAT' => 'H',# HistIDine                                'ATA' => 'I','ATC' => 'I','ATT' => 'I',# Isoleucine                                'AAA' => 'K','AAG' => 'K',# Lysine                                'CTA' => 'L','CTC' => 'L','CTG' => 'L','CTT' => 'L','TTA' => 'L','TTG' => 'L',# Leucine                                'ATG' => 'M',# Methionine                                'AAC' => 'N','AAT' => 'N',# Asparagine                                'CCA' => 'P','CCC' => 'P','CCG' => 'P','CCT' => 'P',# Proline                                'CAA' => 'Q','CAG' => 'Q',# glutamine                                'CGA' => 'R','CGC' => 'R','CGG' => 'R','CGT' => 'R','AGA' => 'R','AGG' => 'R',# Arginine                                'TCA' => 'S','TCC' => 'S','TCG' => 'S','TCT' => 'S','AGC' => 'S','AGT' => 'S',# Serine                                'ACA' => 'T','ACC' => 'T','ACG' => 'T','ACT' => 'T',# Threonine                                'GTA' => 'V','GTC' => 'V','GTG' => 'V','GTT' => 'V',# Valine                                'TGG' => 'W',# Tryptophan                                'TAC' => 'Y','TAT' => 'Y',# Tyrosine                                'TAA' => 'U','TAG' => 'U','TGA' => 'U'                                              # Stop                                }                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future                        ## more translate table Could be added here in future        );	open IN,$file||dIE;	$/=">";<IN>;$/="\n";	my @results_set;	while(<IN>){		my $info=$_;        	my @a=split/\s+/,$info;        	$/=">";my $seq=<IN>;$/="\n";        	$seq=~s/\n|>//g;        	my $len=length$seq;        	my $info_out=join " ",@a[0..($#a-1)];        	my ($pep_out,$triplet);        	for(my $i=0;$i<$len;$i+=3){                	$triplet=substr($seq,3);                	next if(length$triplet!=3);                	if(exists $code{standard}{$triplet}){                        	$pep_out.=$code{standard}{$triplet};                	}else{$pep_out.="X"}        	}        	$pep_out=~s/U$// if($pep_out=~/U$/);        	my $pep_len=length$pep_out;        	$pep_out=~s/([A-Z]{50})/\n/g;        	chop($pep_out) unless($pep_len%50);        	my $results= ">$info_out length=$pep_len\n$pep_out\n";		push @results_set,$results;	}	return @results_set;}__END__
总结

以上是内存溢出为你收集整理的从NCBI基因组数据中获得cds,pep和geneID对应表全部内容,希望文章能够帮你解决从NCBI基因组数据中获得cds,pep和geneID对应表所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

欢迎分享,转载请注明来源:内存溢出

原文地址: http://www.outofmemory.cn/langs/1276265.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-09
下一篇 2022-06-09

发表评论

登录后才能评论

评论列表(0条)

保存