# Source: http://www.ccg.unam.mx/~vinuesa/filoinfo_IE11/scripts/oneliners2process_fastas.txt
### Learning to use Perl one-liners to manipulate fasta files and their headers
# AUTHOR: Pablo Vinuesa, Centro de Ciencias Genomicas, UNAM, Mexico, Sept 7th, 2011.
# A practical example with sequences fetched from GenBank using the ENTREZNT system, as explained below
# Example prepared for the students of the Bachelor's program in Genomic Sciences, Campus Morelos, UNAM,
# and the participants of the Latin American Workshop on Molecular Evolution (TLEM 2011)
# Simplify the headers of the FASTA file fetched through ENTREZNT with the query:
#   Bradyrhizobium[orgn] AND rpoB[gene] AND vinuesa[auth] AND 550:1500[slen]
# Five substitutions are applied, in this order, to every input line:
#   1. delete the first "gi|"
#   2. delete everything from the first "|" through the last "|" on the line
#   3. replace " RNA" plus the rest of the line with "]"
#   4. delete the first " strain"
#   5. put a "[" in front of the first "Brady"
# Pass 1: preview the result, showing only the lines that contain ">".
perl -p \
  -e 's{gi\|}{};' \
  -e 's{\|.*\|}{};' \
  -e 's{ RNA.*}{]};' \
  -e 's{ strain}{};' \
  -e 's{Brady}{[Brady};' \
  rpoB_Bradys_vinuesa.fasta \
  | grep '>'
# Pass 2: same substitutions, this time saving the whole edited file.
perl -p \
  -e 's{gi\|}{};' \
  -e 's{\|.*\|}{};' \
  -e 's{ RNA.*}{]};' \
  -e 's{ strain}{};' \
  -e 's{Brady}{[Brady};' \
  rpoB_Bradys_vinuesa.fasta \
  > rpoB_Bradys_vinuesa.fastaed
# Print the distinct taxa named in the edited headers:
# take space-separated fields 2-3 of every line containing ">",
# sort and de-duplicate them, then strip the first "[" from each line.
grep -e '>' rpoB_Bradys_vinuesa.fastaed \
  | cut -d ' ' -f 2,3 \
  | sort \
  | uniq \
  | sed -e 's/\[//'
# Convert the edited FASTA file to "fastab" format: one record per line,
# written as header<TAB>sequence, with all sequence lines of a record joined.
# Note: the output starts with an empty line (a newline is inserted before
# every ">") and the last record is not newline-terminated.
perl -pe '
  if (/^>/) { s/\n/\t/g }   # header line: turn its line break into a tab
  else      { s/\n//g }     # any other line: drop the line break
  s/>/\n>/;                 # start a new output line before the first ">"
' rpoB_Bradys_vinuesa.fastaed > rpoB_Bradys_vinuesa.fastaedtab
# Generate species-specific files in "fastab" format, one per species name.
# The species name is the third space-separated field of each header line in
# the edited FASTA file, with its first "[" and first "." removed
# (so "sp." becomes "sp").
# Each record of the fastab file is assigned by comparing that same normalised
# header field for exact equality. A plain substring grep would also put
# "genosp." records, or strains whose name contains "sp", into the "sp" file.
# Empty species names (headers with fewer than three fields) are skipped.
grep '>' rpoB_Bradys_vinuesa.fastaed \
  | cut -s -d' ' -f3 \
  | sed 's/\[//; s/\.//' \
  | LC_ALL=C sort -u \
  | while IFS= read -r tax; do
      [ -n "$tax" ] || continue
      # Split only the header part (text before the tab) on single spaces,
      # mirroring the cut call that produced the species list.
      awk -F '\t' -v tax="$tax" '
        {
          split($1, hdr, "[ ]")
          sp = hdr[3]
          sub(/\[/, "", sp)
          sub(/\./, "", sp)
        }
        sp == tax
      ' rpoB_Bradys_vinuesa.fastaedtab > "Bradyrhizobium_${tax}_rpoB_vinuesa.fnatab"
    done
# Restore FASTA format from every *fnatab file with this "tab2fas" one-liner:
# the first tab on each line becomes a line break, and the result is written
# to a file with the same name minus the trailing "tab" (x.fnatab -> x.fna).
for file in *fnatab; do
  # When nothing matches, the glob stays as the literal pattern; skip it so
  # that no stray "*fna" output file is created.
  [ -f "$file" ] || continue
  # Quoted so that file names containing spaces or glob characters work.
  perl -pe 's/\t/\n/' -- "$file" > "${file%tab}"
done
# Finally, remove the intermediate tab-formatted files created by this workflow.
# Only the workflow's own files are named: a bare "*tab" glob would also delete
# any unrelated file in the directory whose name ends in "tab" (e.g. "crontab").
# -f keeps the command quiet when the files are already gone.
rm -f -- rpoB_Bradys_vinuesa.fastaedtab Bradyrhizobium_*_rpoB_vinuesa.fnatab
# (blog page footer) No comments:
# (blog page footer) Post a comment