#!/usr/bin/env python #This tool allow users to plot SVM-prob ROC curve from data from svm import * from sys import argv, platform from os import path, popen from random import randrange , seed from string import atof,atoi from time import sleep #search path for gnuplot executable #be careful on using windows LONG filename, surround it with double quotes. #and leading 'r' to make it raw string, otherwise, repeat \\. gnuplot_exe_list = [r'"C:\Program Files\gnuplot\pgnuplot.exe"', "/usr/bin/gnuplot","/usr/local/bin/gnuplot"] #read svm sparse format from file 'fname' (with labels only) #output: [attributes[], labels[]] def read_svm(fname): attrs = [] labels = [] for line in open(fname).readlines(): label, lineattr = line.split(' ', 1) attr={} for e in lineattr.split(): index,value= e.split(":") attr[int(index)] = float(value) attrs.append(attr) labels.append(float(label)) #float in case of regression return [attrs, labels] def get_pos_deci(train_x, train_y, test_x, param): prob = svm_problem(train_y, train_x) model = svm_model(prob, param) #check if model.get_labels() != [1,-1] and model.get_labels() != [-1,1]: print "ROC is only applicable to binary classes with labels 1, -1" raise SystemExit deci = [] #predict and grab decision value, assure deci>0 for label+ for i in test_x: deci.append(model.predict_values(i)[1,-1]) return deci,model #get_cv_deci(prob_x[], prob_y[], svm_parameter param, nr_fold) #input raw attributes, labels, param, cv_fold in decision value building #output list of decision value, remember to seed(0) def get_cv_deci(prob_x, prob_y, param, nr_fold): if nr_fold == 1 or nr_fold==0: deci,model = get_pos_deci(prob_x, prob_y, prob_x, param) return deci deci, model = [], [] prob_l = len(prob_y) #random permutation by swapping i and j instance for i in range(prob_l): j = randrange(i,prob_l) prob_x[i], prob_x[j] = prob_x[j], prob_x[i] prob_y[i], prob_y[j] = prob_y[j], prob_y[i] #cross training : folding for i in range(nr_fold): begin = i * prob_l / nr_fold end = (i + 1) * prob_l / nr_fold train_x = prob_x[:begin] + prob_x[end:] train_y = prob_y[:begin] + prob_y[end:] test_x = prob_x[begin:end] subdeci, submdel = get_pos_deci(train_x, train_y, test_x, param) deci += subdeci return deci #a simple gnuplot object class gnuplot: def __init__(self, term='onscreen'): # -persists leave plot window on screen after gnuplot terminates if platform == 'win32': cmdline = gnuplot_exe self.__dict__['screen_term'] = 'windows' else: cmdline = gnuplot_exe + ' -persist' self.__dict__['screen_term'] = 'x11' self.__dict__['iface'] = popen(cmdline,'w') self.set_term(term) def set_term(self, term): if term=='onscreen': self.writeln("set term %s" % self.screen_term) else: #term must be either x.ps or x.png if term.find('.ps')>0: self.writeln("set term postscript eps color 22") elif term.find('.png')>0: self.writeln("set term png"); else: print 'You must set term to either *.ps or *.png' raise SystemExit self.output = term def writeln(self,cmdline): self.iface.write(cmdline + '\n') def __setattr__(self, attr, val): if type(val) == str: self.writeln('set %s \"%s\"' % (attr, val)) else: print "Unsupport format:", attr, val raise SystemExit #terminate gnuplot def __del__(self): self.writeln("quit") self.iface.flush() self.iface.close() def __repr__(self): return "" % term #data is a list of [x,y] def plotline(self, data): self.writeln("plot \"-\" notitle with lines linewidth 1") for i in range(len(data)): self.writeln("%f %f" % (data[i][0], data[i][1])) sleep(0) #delay self.writeln("e") if platform=='win32': sleep(3) #processing argv and set some global variables def proc_argv(argv = argv): #The command line : ./plotroc -c 1 -g 0 -m 100 -v 5 -t 2 -T xxx.test xxx.train #default parameter fold = 5 cost = 2**0 kern = 2 gma = 0 cache_size = 200 test_file = None i = 1 while i < len(argv) - 1: if argv[i] == '-v': fold = int(argv[i+1]) elif argv[i] == '-g': gma = atof(argv[i+1]) elif argv[i] == '-c': cost = atof(argv[i+1]) elif argv[i] == '-t': kern = atoi(argv[i+1]) elif argv[i] == '-m': cache_size = atoi(argv[i+1]) elif argv[i] == '-T': test_file = argv[i+1] else: i+=1 continue i += 2 #skip len("-argname argval") if gma!=0: param = svm_parameter(svm_type=C_SVC, kernel_type=kern, gamma=gma, C=cost) else: param = svm_parameter(svm_type=C_SVC, kernel_type=kern, C=cost) return param, fold, argv[-1], test_file def plot_roc(deci, label, output, title): #count of postive and negative labels db = [] pos, neg = 0, 0 for i in range(len(label)): if label[i]>0: pos+=1 else: neg+=1 db.append([deci[i], label[i]]) #sorting by decision value def cmp (x,y): if x[0] < y[0]: return 1 else: return -1; db.sort(cmp) #calculate ROC xy_arr = [] tp, fp = 0., 0. #assure float division for i in range(len(db)): if db[i][1]>0: #positive tp+=1 else: fp+=1 xy_arr.append([fp/neg,tp/pos]) #area under curve aoc = 0. prev_x = 0 for x,y in xy_arr: if x != prev_x: aoc += (x - prev_x) * y prev_x = x #begin gnuplot if title == None: title = output; #also write to file g = gnuplot(output) g.xlabel = "False Positive Rate" g.ylabel = "True Positive Rate" g.title = "ROC curve of %s (AUC = %.4f)" % (title,aoc) g.plotline(xy_arr) #display on screen s = gnuplot('onscreen') s.xlabel = "False Positive Rate" s.ylabel = "True Positive Rate" s.title = "ROC curve of %s (AUC = %.4f)" % (title,aoc) s.plotline(xy_arr) def check_gnuplot_exe(): global gnuplot_exe gnuplot_exe = None for g in gnuplot_exe_list: if path.exists(g.replace('"','')): gnuplot_exe=g break if gnuplot_exe == None: print "You must add correct path of 'gnuplot' into gnuplot_exe_list" raise SystemExit def main(): check_gnuplot_exe() if len(argv) <= 1: print "Usage: %s [-t kern_type][-c Cost][-g gamma][-m cache_size][-v cv_fold][-T testing_file] training_file" % argv[0] raise SystemExit param,fold,train_file,test_file = proc_argv() output_file = path.split(train_file)[1] + '-roc.png' #read data train_x, train_y = read_svm(train_file) #get decision value, with positive = label+ seed(0) #reset random seed if test_file: #go with test_file output_title = "%s on %s" % (path.split(test_file)[1], path.split(train_file)[1]) test_x, test_y = read_svm(test_file) deci,model = get_pos_deci(train_x, train_y, test_x, param) plot_roc(deci, test_y, output_file, output_title) else: #single file -> CV output_title = path.split(train_file)[1] deci = get_cv_deci(train_x, train_y, param, fold) plot_roc(deci, train_y, output_file, output_title) if __name__ == '__main__': main()