Introductory Python - download IDLE and have a tinker

#Originally written for biologists so be careful, you might learn some genetics too

#All prints need a () to work if you're using Python 3.5 as this was written in Python 2.7
----------------------------------------------------------------------------------------------------
# function and method calls
name = "Hello world"      
length = len(name)       # function call
new_name = name.upper()  # method call
print new_name, length
----------------------------------------------------------------------------------------------------
# sets
s1 = set([1,6,2,2,4,5])
s2 = {3,1,2}

print s1
print s2
print 4 in s1
print s2.intersection(s1)
print set('MRVLLVALALLALAASATS')

----------------------------------------------------------------------------------------------------
# lists
numbers = [4,2,2,7]
3motifs = ['gaattc','ggatcc','aagctt']
matrix = [[1,2,3], 
          [4,5,6]]

print numbers[1]
print motifs[0:2]
print matrix[0]
print matrix[0][0]

----------------------------------------------------------------------------------------------------
# tuples
feature = ('TRANSMEM',301,318)
point = (1.0, 2.0)

print feature
print 'type: ',feature[0]
print 'start:',feature[1]
print 'end:  ',feature[2]

----------------------------------------------------------------------------------------------------
# dictionaries
aa2code = {'A':'Ala','C':'Cys','E':'Glu'}

print aa2code['C']
print aa2code.items()
print aa2code.values()
print aa2code.keys()

----------------------------------------------------------------------------------------------------
# variables and string concatentation
first_name = "Liam"
surname = "McIntyre"
name = first_name+' '+surname
print name
----------------------------------------------------------------------------------------------------
# variables and numerical operations - int
length = 5      
width = 10
area = length*width
print area
----------------------------------------------------------------------------------------------------
# type conversion
print 1+2
print '1'+'2'
print '1'*2
print int('1')+2
print str(1)+'2'
print 1/2
print float(1)/2
print 1.0/2
----------------------------------------------------------------------------------------------------
# help functions
dir(list)
help(list)
help(list.append)
help(len)

----------------------------------------------------------------------------------------------------
# data types
bool     = True      
int      = 42        
long     = 42L       
float    = 0.1       
hex      = 0xFF      
string   = 'text'
string   = "text"    
complex  = 1+3j      
function = lambda a,b: a+b

print type(0.1)
print type(1 > 2*3) 

----------------------------------------------------------------------------------------------------
# normalize protein identifiers
ids = "yhr041c-I; YPR018W; YKL182D-I; ykl193c; YOR103C"
ids = ids.upper().replace('-I','').split('; ')
print ids
print ', '.join(ids)

----------------------------------------------------------------------------------------------------
# filter protein ids
ids = "YHR041C YPR018W YKL182D YKL193C YOR103C".split()
print [id for id in ids if id.startswith('YK')]

----------------------------------------------------------------------------------------------------
# create list from a string of protein ids
ids1 = "YHR041C YPR018W YKL182D YKL193C YOR103C"
ids2 = "YHR041C, YPR018W, YKL182D, YKL193C, YOR103C"
print ids1.split()
print ids2.split(', ')

----------------------------------------------------------------------------------------------------
# what are the duplicates?
ids = [1,2,4,5,2,6,7]
print set(id for id in ids if ids.count(id)>1)

----------------------------------------------------------------------------------------------------
# are there duplicate protein ids in the list?
ids = [1,2,4,5,2,6,7]
print len(ids) != len(set(ids))

----------------------------------------------------------------------------------------------------
# working with lists of protein ids
ids1 = [1,2,4,5,2,6,7]
ids2 = [4,3,2,8]

print "shared  :", set(ids1).intersection(ids2)
print "combined:", set(ids1).union(ids2)
print "missing :", set(ids1).difference(ids2)

----------------------------------------------------------------------------------------------------
# some math on data
numbers = [1, 5.1, 3.2, 345, 67]
print "len  = ", len(numbers)
print "sum  = ", sum(numbers)
print "mean = ", sum(numbers)/len(numbers)
print "max  = ", max(numbers)
print "sort = ", sorted(numbers)

# sum over numbers in a file
s = 0
for line in open('data/numbers.txt'):
    s += int(line.strip())
print s    
----------------------------------------------------------------------------------------------------
# find shared protein ids in two files
data1 = open('data/protein_ids1.txt').read()
data2 = open('data/protein_ids2.txt').read()
ids1, ids2 = data1.split(), data2.split()
shared_ids = set(ids1).intersection(ids2)
fout = open('data/protein_ids3.txt','w')
for id in shared_ids:
    fout.write(id+'\n')
fout.close()    
print "done"    

----------------------------------------------------------------------------------------------------
# nucleotide distribution in one line
seq = 'ACAAGATGCCATTGTCCCCCG'
print { n:seq.count(n) for n in set(seq) }

----------------------------------------------------------------------------------------------------
# loop over number range
for i in xrange(1,10,2): 
    print i,
----------------------------------------------------------------------------------------------------
# shared protein ids
ids1 = "YHR045D YPR018W YKL182D YKL193C YOR103C"
ids2 = "YHR041C, YPR018W, YKL193C"

ids1 = ids1.split()
ids2 = ids2.split(', ')
print ids1
print ids2

ids_set1 = set(ids1)
ids_set2 = set(ids2)
print "shared  :", ids_set1.intersection(ids_set2)

----------------------------------------------------------------------------------------------------
feature = ('TRANSMEM',301,318)
point = (1.0, 2.0)

print feature
print 'type: ',feature[0]
print 'start:',feature[1]
print 'end:  ',feature[2]

----------------------------------------------------------------------------------------------------
aa2code = {'A':'Ala','C':'Cys','E':'Glu'}

print aa2code['C']
print aa2code.items()
print aa2code.values()
print aa2code.keys()

squares= { i:i*i for i in range(10) }
print squares[4]

----------------------------------------------------------------------------------------------------
s1 = set([1,6,2,2,4,5])
s2 = {3,1,2}

print 4 in s1  # fast
#print s1[2]  # can't do
print s2.intersection(s1)


----------------------------------------------------------------------------------------------------
numbers = [4,2,2,7]
motifs = ['gaattc','ggatcc','aagctt']

print numbers[1]   # fast
print 2 in numbers # slow


----------------------------------------------------------------------------------------------------
# annotate gene list
fin = open("data/AC_000091.ptt")
fin.readline(); fin.readline(); fin.readline()
gene2anno = dict()
for line in fin:
    elems = line.strip().split('\t')
    gene, cogs, product = elems[4],elems[7],elems[8]
    gene2anno[gene] = cogs+'\t'+product
fin.close()
#print gene2anno

fin = open("data/AC_000091.genes")
fout = open("data/AC_000091.anno", 'w')
for gene in fin.read().split():
    fout.write(gene+'\t'+gene2anno[gene]+'\n')    
fout.close()
fin.close()
print "done."
----------------------------------------------------------------------------------------------------
# calculate some stats over gene lengths
f = open("data/AC_000091.ptt")
f.readline(); f.readline(); f.readline()
lengths = []
for line in f:
    elems = line.split('\t')
    length = int(elems[2])
    lengths.append(length)
print "min = ", min(lengths)
print "max = ", max(lengths)
print "avg = ", sum(lengths)/len(lengths) 

from pylab import hist, show   
hist(lengths); show()   

----------------------------------------------------------------------------------------------------
# filter for genes shorter than 100 aa
f = open("data/AC_000091.ptt")
f.readline(); f.readline(); f.readline()
for line in f:  
    elems = line.split('\t')
    length = int(elems[2])
    if length < 100:
        print elems[4], elems[2]
f.close()

----------------------------------------------------------------------------------------------------
#print first 5 lines
f = open("data/AC_000091.ptt")
for i in xrange(5):
    line = f.readline()
    print line.split('\t')

----------------------------------------------------------------------------------------------------
# show lines of file
for line in open('data/numbers.txt'):
    print line,
    #print line.strip()  
----------------------------------------------------------------------------------------------------
# nucleotide distribution
seq = 'ACAAGATGCCATTGTCCCCCG'
freq = {}
for ch in seq:
    freq[ch] = freq.get(ch,0) + 1
print freq

----------------------------------------------------------------------------------------------------
# codon distribution
seq = 'ACAACATGCCATCATCCCACA'
codon2freq = dict()
for i in xrange(0,len(seq),3):
    codon = seq[i:i+3]
    codon2freq[codon] = codon2freq.get(codon,0) + 1
    
for codon, freq in codon2freq.items():
    print codon, freq
----------------------------------------------------------------------------------------------------
# write a file
f = open(filepath, 'w')

f.write(string)
f.writelines(list)
f.close()
  
----------------------------------------------------------------------------------------------------
# read a file
f = open(filepath)
f = open(filepath,'r')

f.read()
f.readlines()
f.readline()
f.close()
  
----------------------------------------------------------------------------------------------------
# skipping and breaking in loops
for i in xrange(10): 
    if i > 5: break
    if i < 2: continue
    print i,
    
----------------------------------------------------------------------------------------------------
# loop over string characters
for ch in "mystring": 
    print ch,
----------------------------------------------------------------------------------------------------
# loop over tuple or list elements
for e in ("red","green","blue"): 
    print e,
----------------------------------------------------------------------------------------------------
# syntax of for loop
for element in sequence:
    # indentation has meaning
    statement
----------------------------------------------------------------------------------------------------
# extending lists and tuples
nums = [1,2,3]
nums.append(4) # not for tuple
print nums
print [5,6] + nums
print (1,2,3) + (4,5)
#print (1,2,3) + [4,5]  # can't do
print (1,2,3) + tuple([4,5])

----------------------------------------------------------------------------------------------------
# tuple/list unpacking
start, end, kind = (121,256,'TRANSMEM')
print start, end, kind

(a,b,c) = (1,2,3)
(a,b,c) =  1,2,3
a,b,c  = 1,2,3
a,b,c  = [1,2,3]
print a,b,c

numbers = [1,2,3,4] 
second,third = numbers[1:3]
print second,third

----------------------------------------------------------------------------------------------------
# list/tuple slicing
# slice[start:end:stride]
nums = [1,2,3,4,5] # list or tuple
print nums[0]      # zero based index
print nums[-1]     # last element
print nums[:-1]    # up to last
print nums[-2:]    # last two elements
print nums[0:2]    # first two elements
print nums[::2]    # every 2nd element
print nums[1::2]   # from 2nd every 2nd
print nums[::-1]   # reversed
print nums[:]      # copy
----------------------------------------------------------------------------------------------------

# syntax of if conditional
if boolean_expression:
    # indentation has meaning
    statement
----------------------------------------------------------------------------------------------------
# safe square root
from math import sqrt
a = -1.0
sqrt_a = sqrt(a) if a >= 0 else 0.0
print sqrt_a
#print sqrt(a)


----------------------------------------------------------------------------------------------------
# if-elif expression
name = 'arthur'
if name == 'Liam':
    print "it's me"
elif name == 'arthur': 
    print "it's arthur"
elif name == 'john': 
    print "it's john"
else: 
    print "don't know who it is"

----------------------------------------------------------------------------------------------------
# simple boolean comparisons
print 1 == 2
print 1 != 2
print 1 < 2
print 1 > 2
print 1 <= 2
print 1 >= 2
print 'alpha' < 'beta'
print 1 < 'alpha' # don't do this
----------------------------------------------------------------------------------------------------
# safe division
a = 1.0
b = 0.0
ratio = a/b if b > 0 else 0.0
print ratio
#print a/b

----------------------------------------------------------------------------------------------------
# if-elif expression
x = 4
if   1 < x <= 3:  print "lower range"
elif 3 < x <= 5:  print "middle range"
elif 5 < x <= 10: print "upper range"    
else:             print "out of range"

----------------------------------------------------------------------------------------------------
# conditional expression
result = value_true if boolean_expr else value_false
----------------------------------------------------------------------------------------------------
# complex boolean expressions
print 1 < 2 and 2 < 3
print 1 < 2 < 3
print not 1 > 2
print not 1 > 2 or 1 < 3
----------------------------------------------------------------------------------------------------
# if expression
name = 'Liam'
if name == 'Liam': 
    print "it's me"

----------------------------------------------------------------------------------------------------
# boolean comparisions of structures
print (1, 2, 3) < (1, 2, 4) 
print [1, 2, 3] < [1, 2, 4] 
print 'C' < 'Pascal' < 'Perl' <'Python'
print (1, 2, 3, 4) < (1, 2, 4) 
print (1, 2) < (1, 2, -1) 
print (1, 2, 3) == (1.0, 2.0, 3.0) 
print (1, 2, ('aa', 'ab')) < (1, 2, ('abc', 'a'), 4) 

----------------------------------------------------------------------------------------------------
# if-else expression
name = 'arthur'
if name == 'Liam': 
    print "it's me"
else:
    print "it's not me"

----------------------------------------------------------------------------------------------------
# print first 5 lines and split line elements
f = open("data/AC_000091.ptt")
for i in xrange(5):
    line = f.readline()
    print line.split('\t')

----------------------------------------------------------------------------------------------------
# read file as one string
f = open('data/numbers.txt')
print f.read() 
----------------------------------------------------------------------------------------------------
# read a file
f = open(filepath)
f = open(filepath,'r')

f.read()
f.readlines()
f.readline()
f.close()
  
----------------------------------------------------------------------------------------------------
# write a file
f = open(filepath, 'w')

f.write(string)
f.writelines(list)
f.close()
  
----------------------------------------------------------------------------------------------------
# filter for genes shorter than 100 aa
f = open("data/AC_000091.ptt")
f.readline(); f.readline(); f.readline()
for line in f:  
    elems = line.split('\t')
    length = int(elems[2])
    if length < 100:
        print elems[4], elems[2]
f.close()

----------------------------------------------------------------------------------------------------
# calculate some stats over gene lengths
f = open("data/AC_000091.ptt")
f.readline(); f.readline(); f.readline()
lengths = []
for line in f:
    elems = line.split('\t')
    length = int(elems[2])
    lengths.append(length)
print "min = ", min(lengths)
print "max = ", max(lengths)
print "avg = ", sum(lengths)/len(lengths) 

#from pylab import hist, show   
#hist(lengths); show()   

----------------------------------------------------------------------------------------------------
# annotate gene list
fin = open("data/AC_000091.ptt")
fin.readline(); fin.readline(); fin.readline()
gene2anno = dict()
for line in fin:
    elems = line.strip().split('\t')
    gene, cogs, product = elems[4],elems[7],elems[8]
    gene2anno[gene] = cogs+'\t'+product
fin.close()
#print gene2anno

fin = open("data/AC_000091.genes")
fout = open("data/AC_000091.anno", 'w')
for gene in fin.read().split():
    fout.write(gene+'\t'+gene2anno[gene]+'\n')    
fout.close()
fin.close()
print "done."
----------------------------------------------------------------------------------------------------
# iterate over lines in a file
for line in open('data/numbers.txt'):
    print line.strip()  
----------------------------------------------------------------------------------------------------
# filter protein ids within a file
fin  = open('data/protein_ids2.txt')
fout = open('data/protein_ids3.txt','w')
for line in fin:
    id = line.strip()
    if id.startswith('YO'):  
        fout.write(id+'\n')
        #fout.flush()
fout.close()
fin.close()   
print "done"    
----------------------------------------------------------------------------------------------------
# read lines of file as a list
f = open('data/numbers.txt')
print f.readlines()  
----------------------------------------------------------------------------------------------------
# read lines of file as a list
print list(open('data/numbers.txt')) 
----------------------------------------------------------------------------------------------------
# read first three lines of a small file
print list(open('data/numbers.txt'))[:3]  
----------------------------------------------------------------------------------------------------
# read first three lines of a large file
f = open('data/numbers.txt')
for i in range(3):
    print f.readline(),  
----------------------------------------------------------------------------------------------------

# transribe DNA
def transcribe(dna):
    """dna -- DNA string
       returns RNA string"""    
    return dna.replace('T', 'U')

print transcribe('ACAAGATGCCATTGTCCCCCGG')    
----------------------------------------------------------------------------------------------------
# elegant but inefficient method to test for overlap
def overlap(start1,end1, start2,end2):
    """returns true if two regions overlap"""    
    region1 = set(range(start1,end1+1))
    region2 = set(range(start2,end2+1))
    return len(region1 & region2) > 0
    

print overlap(1,10, 2,12)
print overlap(1,10, 11,20)
----------------------------------------------------------------------------------------------------
# calculate mean value
def mean(xs):
    """xs -- list of numbers
       returns the mean value"""
    return sum(xs)/float(len(xs))
    
print mean([1,2,3,4])
print mean(range(1,5)) 
----------------------------------------------------------------------------------------------------
# test if two genomic regions overlap
def overlap(start1,end1, start2,end2):
    """returns true if two regions overlap"""    
    if start1 <= start2 and end1 <= end2 and end1 >= start2: return True
    if start1 >= start2 and end1 >= end2 and start1 <= end2: return True
    if start1 <= start2 and end1 >= end2: return True
    if start1 >= start2 and end1 <= end2: return True
    return False
    

print overlap(1,10, 2,12)
print overlap(1,10, 11,20)
----------------------------------------------------------------------------------------------------
# scope of a variable
text = "outer"
print text

def output(text):
    print text    
    
output("inner") 
print text
----------------------------------------------------------------------------------------------------
# calculate GC content
def gc(dna):
    """returns the gc content of the dna string"""
    return 100.0 * (dna.count('G')+dna.count('C')) / len(dna)

dna = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGC"
print "gc = %.2f %%" % gc(dna)    
----------------------------------------------------------------------------------------------------
# gene list versus gene list
genes1 = ['TP53','ST5']
genes2 = ['VHL','APC','CD95']
pairs = [(g1,g2) for g1 in genes1 for g2 in genes2]
print pairs
----------------------------------------------------------------------------------------------------
# enumerate genes
genes = ['TP53','ST5','VHL','APC']
for i,g in enumerate(genes):
    print i,g
----------------------------------------------------------------------------------------------------
# all versus all excluding symmetrical pairs
genes = ['TP53','ST5','VHL','APC']
pairs = [(g1,g2) for i,g1 in enumerate(genes) for j,g2 in enumerate(genes) if j<i]
print pairs
----------------------------------------------------------------------------------------------------
# all versus all excluding symmetrical pairs
genes = ['TP53','ST5','VHL','APC']
pairs = { frozenset([g1,g2]) for g1 in genes for g2 in genes }
print pairs
----------------------------------------------------------------------------------------------------
# sum over numbers in a file
numbers = [int(line.strip()) for line in open('data/numbers.txt')]
print sum(numbers)   
----------------------------------------------------------------------------------------------------
numbers = [1,2,3]
chars = ['a','b','c']
print zip(numbers, chars)

----------------------------------------------------------------------------------------------------
# compute logs of numbers - without list comprehension
from math import log10
numbers = [0.1, 0, 1, -1, 10, 100]
logs = []
for x in numbers:
    if x>0:
        logs.append(log10(x))       
print logs          
----------------------------------------------------------------------------------------------------
y = {x*x for x in {1,2,3}}
print y

----------------------------------------------------------------------------------------------------

from csv import reader
# read rows and cols of a CSV file
rows = list(reader(open('data/BMM+UTI89.csv','rU')))
cols = zip(*rows)
print rows[0]
print cols[0] 

----------------------------------------------------------------------------------------------------

import csv
# read and show CSV file
f = open('data/BMM+UTI89.csv','rU')
reader = csv.reader(f)
for row in reader:
    print row

----------------------------------------------------------------------------------------------------
[x for x in seq if cond]
{x for x in seq if cond}

----------------------------------------------------------------------------------------------------
def add(a,b): 
    """returns sum of a and b"""   
    return a+b
    
#print add(1,2)
#help(add) 
----------------------------------------------------------------------------------------------------
# compute complement of a DNA sequence
dna = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGC"
complement = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
cdna = [complement[nuc] for nuc in dna]
print cdna
print "".join(cdna)

----------------------------------------------------------------------------------------------------
# compute logs of numbers - with list comprehension
from math import log10
numbers = [0.1, 0, 1, -1, 10, 100]
logs = [log10(x) for x in numbers if x>0]
print logs         
----------------------------------------------------------------------------------------------------
# Differentially regulated genes. Data by Nilesh Bokil
from csv import reader, writer
from math import log

def load_cols(filepath):
    rows = list(reader(open(filepath,'rU')))[1:]
    return zip(*rows)
    
def fold_change(x,y):
    x,y = float(x),float(y)
    return log(y/x) if x*y!=0 else 0.0
    
def fold_changes(xs,ys):
    return [fold_change(x,y) for x,y in zip(xs,ys)]
    
def q1(fc1,fc2): # down- vs up- regulated
    return max( -fc1*fc2, -fc2*fc1 ) 
       
def q2(fc1,fc2): # no- vs large- change in regulation
    def ratio(x,y): return abs(x)/abs(y) if x*y!=0 else 0.0
    return max( ratio(fc1,fc2), ratio(fc2,fc1) ) 
    
def key((id,q)): return -q   # key for sorting                    

ids,names,eids,bs2,us2,bs4,us4 = load_cols('data/BMM+UTI89.csv')
fcs2, fcs4 = fold_changes(bs2,us2), fold_changes(bs4,us4)
gene_qs = [(id, q1(fc2,fc4)) for id,fc2,fc4 in zip(ids,fcs2,fcs4)]
csvwriter = writer(open("data/BMM+UTI89_reg.csv",'wb'))
csvwriter.writerows(sorted(gene_qs, key=key)[:5])


----------------------------------------------------------------------------------------------------
# average length of genes
f = open("data/AC_000091.ptt")
f.readline(); f.readline(); f.readline()
rows = [line.split('\t') for line in f]
cols = zip(*rows)
lengths = [float(l) for l in cols[2]]
print "avg length:",sum(lengths)/len(lengths)
----------------------------------------------------------------------------------------------------
rows = [[ 1,  2 ], 
        ['a','b'],
        [ 5,  6 ]]
cols = zip(*rows)
#cols = zip( [1,2], ['a','b'], [5,6] )
print rows
print cols
----------------------------------------------------------------------------------------------------

# parse atomic data
import re
pattern = r'([A-Z]+)(\d+)(\w+)-(\w+)'
text = open('data/atomic.txt').read()
matches = re.findall(pattern,text)
for match in matches: print match


----------------------------------------------------------------------------------------------------
# find prosite pattern [LIVM](2)-x-R-L-[DE]-x(4)-R-L-E
from re import finditer
prosite = r'[LIVM]{2}.RL[DE].{4}RLE'
seq = "MANMQGLVERLERAVSRLESLSAESHRPPGNCGEVNGV"
for match in finditer(prosite,seq):
   print match.start(), match.end(), match.group() 
----------------------------------------------------------------------------------------------------
# find promoter sites
import re
motif = r'TTGACA|TTGACC|TTAACA'
seq = 'TGGCTTGACAGACATATGCATTGACCCGTAAA'
matches = re.findall(motif,seq)
print matches
----------------------------------------------------------------------------------------------------

ids = [('X1245',12), ('A4321',20), ('Y0121',42)]
print sorted(ids)
print sorted(ids, key=lambda (id,l): l)
print sorted(ids, key=lambda (id,l): -l)

----------------------------------------------------------------------------------------------------

ids = ['X1245', 'A4321', 'Y0121']
print sorted(ids)
print sorted(ids, key=lambda id: id[1:])
print sorted(ids, key=lambda id: -int(id[1:])) 
----------------------------------------------------------------------------------------------------
def add1(a,b): return a+b
add2 = lambda a,b: a+b
print add1(1,2)
print add2(1,2) 
----------------------------------------------------------------------------------------------------
# scope of a variable
text = "outer"
print text

def output(text):
    print text    
    
output("inner") 
print text
----------------------------------------------------------------------------------------------------
# find poly-A
from re import finditer
dna = "ACTGCGGAAAAACATCGAAAAAAACTTGTGNNNATCGNNACTTTG"
for match in finditer('A{3,}',dna): 
    print match.start(), match.group()

----------------------------------------------------------------------------------------------------
# examples of regular expressions
from re import findall
def find(text,pattern): print list(findall(pattern,text))
find('GO:0033644',      r'GO:(\d{7})')
find('stefan@uq.edu.au',r'(.+)@(.+)')
find('stefan@uq.edu.au',r'(.+)@.+')
find('20.09.2012',      r'(\d{2})\.(\d{2})\.(\d{4}|\d{2})') 
find('07 344-62606',    r'(\d{2})[ -](\d{3})[ -](\d{5})') 
----------------------------------------------------------------------------------------------------
# extract data from xml/html
import re
pattern = r'<(data\d)>(.+?)</data\d>'
#pattern = r'<(.+)>(.+)</\1>'   #better
data = '<data1>123</data1> <data2>45</data2>'
matches = re.findall(pattern,data)
for tag,data in matches:
    print tag,data
----------------------------------------------------------------------------------------------------
# take apart a GO annotation
import re
text = 'GO:0033644; C:cell membrane; IEA:UniProtKB-KW'
print re.split('; |:', text)
print re.findall(r'GO:(\d{7})', text)
print re.findall(r'([CBM]):(.+);', text)
print re.findall(r'IEA:(.+)', text)
print re.sub(r'(.+); .:(.+); (.+)', r'\2 (\1)', text)
 
----------------------------------------------------------------------------------------------------
# parse SSearch output
import re
pattern = r'[A-Z]{2}:(\w+) .+(..\de-\d\d)'
text = open('data/ssearch.txt').read()
matches = re.findall(pattern,text)
for match in matches: print match


----------------------------------------------------------------------------------------------------
# retrieve protein sequence from uniprot
from urllib import urlopen
f = urlopen("http://www.uniprot.org/uniprot/Q6GZX3.fasta")
print f.read()
f.close()

----------------------------------------------------------------------------------------------------
# retrieve data for a protein from uniprot
# there is better ways to do this for uniprot
import urllib,re
f = urllib.urlopen("http://www.uniprot.org/uniprot/Q6GZX3.txt")
for line in f:    
    if line.startswith('OS'): print 'Organism:', line[5:],
    if line.startswith('DR   GO;'): print 'GO:', line[8:],
    if line.startswith('FT'): print 'Feature:', line[5:],    
f.close()

----------------------------------------------------------------------------------------------------
# retrieve fasta sequence from pdb
from urllib import urlopen
f = urlopen("http://www.pdb.org/pdb/files/fasta.txt?structureIdList=1A34")
print f.read()
f.close() 
----------------------------------------------------------------------------------------------------
# retrieve data for a protein from uniprot
# there is easier ways (see above)
import urllib,re
f = urllib.urlopen("http://www.uniprot.org/uniprot/Q6GZX3")
page = f.read()
f.close()
print re.findall(r'Organism.*?<a href="/taxonomy/.*?">(.*?)</a>', page)
print re.findall(r'Sequence length</acronym></td><td>(.*?) AA.</td>', page)
print re.findall(r'<a href="/locations/.*?">(.*?)</a>', page)

----------------------------------------------------------------------------------------------------

#replace probe ids by expression values in list of lists
data = [['ENSMUST00000172812', 'IVGNm04959', 'IVGNm13150', 'IVGNm13165','IVGNm13171'], 
        ['ENSMUST00000173499', 'IVGNm13150', 'IVGNm13165'], 
        ['ENSMUST00000173523', 'IVGNm13150', 'IVGNm13165']]
exps = {'IVGNm13171': 0.1, 'IVGNm13615': 0.9, 'IVGNm13150': 0.7} 

print [[row[0]]+[exps[e] for e in row if e in exps] for row in data]
print [[row[0]]+[exps[e] if e in exps else 0.0 for e in row[1:]] for row in data]
print [[row[0], sum(exps[e] for e in row if e in exps)] for row in data]        
----------------------------------------------------------------------------------------------------
#some fun with list of lists
data = [ [1,2,3], [4,5] ]

print [row for row in data]
print [[elem for elem in row] for row in data]
print [[elem+1 for elem in row] for row in data]
print [[elem for elem in row if elem!=3] for row in data]
print [[0 if elem<3 else 1 for elem in row ] for row in data]
print [row[::-1] for row in data][::-1]
print [sum(row) for row in data]        
----------------------------------------------------------------------------------------------------
# sorting with key function
data = [(1,'c',2.0), (2,'b',3.0), (3,'a',1.0)]
 
print sorted(data)  
print sorted(data, reverse=True)     
print sorted(data, key=lambda (x,y,z): x*z)

----------------------------------------------------------------------------------------------------
# logging with variable parameter list
def log(text, *args):   
    print text, ",".join(args)
        
log("Loading:", 'text.dat', 'text2.dat')
log("Writing:", 'out.dat')


----------------------------------------------------------------------------------------------------
# formatting output
text = 'test'
pi = 3.1415
one = 1
print text
print "some text   : %s" % text
print "some text   : %10s" % text
print "some integer: %d" % one
print "some float  : %.2f" % pi
print "some float  : %10.2f" % pi
print "this is %d %% (one percent)" % one
print "several vars: %s, %d, %.1f" %(text, one, pi)

----------------------------------------------------------------------------------------------------
# printing separator lines
def separator(char = '-', n = 20):
    print char*n
    
separator()
separator(char='*')
separator(n=10)
separator(n=15,char='+') 

----------------------------------------------------------------------------------------------------
# perform redundancy reduction
# dirty version, files not closed
import os
os.chdir("tools")
process = os.popen('blastclust -i ../data/MR_GR.ffa -S 50')
process.next()  # skip header
cids = [cluster.split()[0] for cluster in process]
seqfile = open('../data/MR_GR.ffa')
seqs = seqfile.read().split('>')[1:]
id2seq = {seq.split()[0]:seq for seq in seqs}
reduced = ">"+">".join(id2seq[cid] for cid in cids)
open('../data/MR_GR_red.ffa','w').write(reduced)
print 'finished.'
----------------------------------------------------------------------------------------------------
# run blastclust, similarity 50%, capture stdout
import os
os.chdir("tools")
cmd = 'blastclust -i ../data/MR_GR.ffa -S 50'
process = os.popen(cmd)      #deprecated but simple
output = "".join(process.readlines())
process.close() 
print output   
----------------------------------------------------------------------------------------------------
# run blastclust, using subprocess, capture stdout
from os import chdir
from subprocess import Popen, PIPE
chdir("tools")
cmd = 'blastclust -i ../data/MR_GR.ffa -S 50'
process = Popen(cmd, shell=True, stdout=PIPE)
process.stdout.next()     #skip header
for cluster in process.stdout:
    print cluster.split()  
----------------------------------------------------------------------------------------------------
# run blastclust, write to file
import os
os.chdir("tools")
cmd = 'blastclust -i ../data/MR_GR.ffa -o out.txt'
os.system(cmd)

----------------------------------------------------------------------------------------------------
# heatmap of expression data
import csv
from pylab import cm,array,matshow,show
reader = csv.reader(open('data/expression.csv','rb'))
reader.next() # skip header
rows = list(reader)
mat = array([map(float,col) for col in zip(*rows)[1:]])
matshow(mat, cmap=cm.RdYlGn)
show()

----------------------------------------------------------------------------------------------------
# test null-hypothesis that the means of length distributions of
# corticoid receptor proteins is the same for the four classes,
# using Kruskal-Wallis test
from pylab import mean,array
from Bio.SeqIO import parse
from scipy.stats import kruskal
from collections import defaultdict

def load(fname):
    id2seqs = defaultdict(list)
    with open(fname) as f:
        for bs in parse(f,'fasta'):
            id2seqs[bs.id[:2]].append(bs.seq)       
    return id2seqs            
  
id2seqs = load('data/MR_GR.ffa')              

lens = [array(map(len,seqs)) for seqs in id2seqs.values()]
print 'Kruskal-Wallis p =', kruskal(*lens)[1] 

----------------------------------------------------------------------------------------------------
# data tables: similar to R data frames
from numpy import array
descriptor = {'names'  : ('gender','age','weight'), 
              'formats': ('S1',    'f4', 'f4') } 
data = array([('M', 64.0, 75.0), 
              ('F', 25.0, 60.0)], 
              dtype=descriptor)
print data['weight']
print data['gender']

----------------------------------------------------------------------------------------------------
# boxplot of expression data
import csv
from pylab import boxplot,show
reader = csv.reader(open('data/expression.csv','rb'))
reader.next() # skip header
rows = list(reader)
cols = [map(float,col) for col in zip(*rows)[1:]]
boxplot(cols)
show()

----------------------------------------------------------------------------------------------------
# test null hypothesis that gene-wise GC content of two different
# chlamydia species is the same, using Mann-Whitney test
from pylab import hist,show,xlabel
from Bio.SeqIO import parse
from Bio.SeqUtils import GC
from scipy.stats import mannwhitneyu

def load(fname): 
    with open(fname) as f:    
        return list(parse(f,'fasta'))
        
def plot_gc(fname):                
    seqs = [s.seq for s in load(fname)]
    gcs = map(GC,seqs)  
    hist(gcs, 40)
    xlabel('GC-content')
    return gcs
    
p_gcs = plot_gc('data/C_pneumoniae.fasta')
c_gcs = plot_gc('data/C_caviae.fasta')
print 'Mann-Whitney U-test p =', mannwhitneyu(p_gcs, c_gcs)[1]
show()

----------------------------------------------------------------------------------------------------
# a tiny example of matrix processing
from numpy import array
M = array([[1, 2, 3], 
           [4, 5, 6]])
print M
print M.sum()
print M.sum(axis=1)
print M>2
print M[M>2]
print M[:,0:2]*2
----------------------------------------------------------------------------------------------------
# load fasta sequences, print id and first 10 amino acids
from Bio.SeqIO import parse
with open('data/MR_GR.ffa') as f:
    for bs in parse(f,'fasta'):
        print bs.id, bs.seq[:10]+"..."
----------------------------------------------------------------------------------------------------
# networkx lib for graph plotting
# plot interaction network for selected genes
import csv
from networkx import * 
from pylab import show,figure,axis
from pylab import figure,axis,show

genes = set(['BRCA1','UBC','TP53'])
reader = csv.reader(open('data/network.csv'))
reader.next() #skip header
rows = [r for r in reader if r[0] in genes or r[1] in genes]
cols = zip(*rows)

nodes = set(cols[0]+cols[1])
edges = zip(cols[0],cols[1])
iedges = [(r[0],r[1]) for r in rows if r[2]=='inhibition']
aedges = [(r[0],r[1]) for r in rows if r[2]=='activation']

G = Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

figure(figsize=[16,20])
axis('off')
pos = spring_layout(G)
draw_networkx_edges(G, pos, edgelist=iedges, edge_color="red", width=0.5) 
draw_networkx_edges(G, pos, edgelist=aedges, edge_color="green", width=0.5)    
draw_networkx_nodes(G, pos, node_size=900, node_color="white", linewidths=0, alpha=0.8)
draw_networkx_labels(G, pos, font_color="k", font_size=10.0)
show()

----------------------------------------------------------------------------------------------------
# plotting of trigonometric functions
from pylab import *
x = linspace(0,3*pi,50)
plot(x, sin(x), '.-')
plot(x, cos(x), '.-')
legend(['sine','cosine'])
title('trigonometric functions')
savefig("pics/trigonometric.png",format="png")
show()

----------------------------------------------------------------------------------------------------
# plot GC content vs gene lengths for different chlamyida species
from pylab import mean,plot,show,legend,xlabel,ylabel
from Bio.SeqIO import parse
from Bio.SeqUtils import GC

def load(fname): 
    with open(fname) as f:    
        return list(parse(f,'fasta'))

def plot_gc(fname):
    seqs = [bs.seq for bs in load(fname)]
    ls, gcs = map(len,seqs), map(GC,seqs)
    plot(ls,gcs, '.')
    xlabel('length')
    ylabel('GC-content')

plot_gc('data/C_pneumoniae.fasta')
plot_gc('data/C_trachomatis.fasta')
plot_gc('data/C_caviae.fasta')
legend(['C. Pneumoniae','C. Trachomatis','C. Caviae'])
show()

----------------------------------------------------------------------------------------------------
# plot distribution of gene GC content for yeast
from pylab import hist,show,xlabel,title
from Bio.SeqIO import parse
from Bio.SeqUtils import GC

with open('data/yeast_genes.fasta') as f:   
    gcs = [GC(g.seq) for g in list(parse(f,'fasta'))]
hist(gcs, 40)
title('GC-content yeast genes')
xlabel('GC-content')
show()

----------------------------------------------------------------------------------------------------
# histogram of normally distributed data
from pylab import *
hist(randn(1000), 20)
show()

----------------------------------------------------------------------------------------------------