% % The SVM.1 approach in the paper: % % David D. Lewis, Yiming Yang, Tony G. Rose, and Fan Li. % RCV1: A new benchmark collection for text categorization research. % Journal of Machine Learning Research, 5:361-397, 2004. % % We use liblinear as the classifier. % The classification algorithm can be either l2-loss, l1-loss SVM or logistic regression function rcv1_lineart_col(dataset, algorithm, swap) global algo algo = algorithm; global rcv1_x % shall we swap training/testing set? swap = 0; if nargin == 3 && strcmp(swap, 'swap') swap = 1; end disp('INFO: loading training data') load rcv1_train.mat disp('INFO: loading testing data') load rcv1_test.mat trainprefix = ''; testprefix = 't'; if swap trainprefix = 't'; testprefix = ''; tmp = rcv1_x; rcv1_x = rcv1_tx'; rcv1_tx = tmp'; clear tmp else rcv1_x = rcv1_x'; rcv1_tx = rcv1_tx'; end switch dataset case {'topics', 'regions', 'industries'} eval(sprintf('rcv1_y = rcv1_%s_%sy;', dataset, trainprefix)); eval(sprintf('rcv1_map = rcv1_%s_%smap;', dataset, trainprefix)); eval(sprintf('rcv1_ty = rcv1_%s_%sy;', dataset, testprefix)); eval(sprintf('rcv1_tmap = rcv1_%s_%smap;', dataset, testprefix)); otherwise disp('INFO: Unkown dataset') return end % eliminate instances that have duplicated label rcv1_y = full(rcv1_y); rcv1_ty = full(rcv1_ty); t0all = clock; rand('seed', 1); % fill x probk = size(rcv1_x, 1); testk = size(rcv1_tx, 1); k = max(probk, testk); rcv1_x = [rcv1_x; zeros(k-probk, size(rcv1_y, 1))]; rcv1_tx = [rcv1_tx; zeros(k-testk, size(rcv1_ty, 1))]; bias=1; rcv1_x=[rcv1_x; ones(1, size(rcv1_x,2))*bias]; rcv1_tx=[rcv1_tx; ones(1, size(rcv1_tx,2))*bias]; k = size(rcv1_x, 1); d = size(rcv1_y, 2); w = zeros(size(rcv1_x, 1), d); b = zeros(d, 1); % obtain (w,b) for each label for i = 1:d fprintf(1, 'INFO: training label %d (i %d)...\n', rcv1_map(i), i); t0 = clock; [w(:, i), b(i)] = train_one_label(2*rcv1_y(:, i)-1); fprintf(1, 'INFO: b = %g\n', b(i)); fprintf(1, 'INFO: Training Time: %.6f seconds\n', etime(clock, t0)); end fprintf(1, 'INFO: predicting...\n'); t0 = clock; fprintf(1, 'INFO: calculate measurement\n'); % we calculate measurement for 1+train(101) (the first row in the paper) F = 0; tp_sum = 0; fp_sum = 0; fn_sum = 0; tn_sum = 0; for i = 1:d % deal with this label label = rcv1_map(i); test_idx = find(rcv1_tmap == label); if isempty(test_idx) orig = zeros(size(rcv1_ty, 1), 1); else orig = rcv1_ty(:, test_idx); end pre = ((w(:,i)'*rcv1_tx)' + b(i)) > 0; tp = full(sum(orig == +1 & pre == +1)); fn = full(sum(orig == +1 & pre == 0)); tn = full(sum(orig == 0 & pre == 0)); fp = full(sum(orig == 0 & pre == +1)); this_F = 0; if tp ~= 0 || fp ~= 0 || fn ~= 0 this_F = (2*tp) / (2*tp + fp + fn); end F = F + this_F; tp_sum = tp_sum + tp; fp_sum = fp_sum + fp; fn_sum = fn_sum + fn; tn_sum = tn_sum + tn; fprintf(1, 'INFO: label %d:\tF %.6f\ttp %d\tfp %d\ttn %d\tfn %d\n', label, this_F, tp, fp, tn, fn); end micro = (2*tp_sum) / (2*tp_sum + fp_sum + fn_sum); macro = F / d; fprintf(1, 'INFO: tp_sum %d fp_sum %d fn_sum %d tn_sum %d\n', tp_sum, fp_sum, fn_sum, tn_sum); fprintf(1, 'INFO: microaverage: %.6f\n', micro); fprintf(1, 'INFO: macroaverage: %.6f\n', macro); fprintf(1, 'INFO: Total Time: %.6f seconds\n', etime(clock, t0all)); % % train_one_label % function [w, b] = train_one_label(proby) global rcv1_x fbr_list = [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8]; nr_fold = 3; l = size(proby, 1); perm = randperm(l)'; f_list = zeros(size(fbr_list)); % cv for fold = 1:nr_fold train_id = [1:floor((fold-1)*l/nr_fold) floor(fold*l/nr_fold)+1:l]'; valid_id = [floor((fold-1)*l/nr_fold)+1:floor(fold*l/nr_fold)]'; y = proby(perm(train_id)); validy = proby(perm(valid_id)); % scutfbr [scutfbr_w, scutfbr_b_list] = scutfbr(y, rcv1_x(:, perm(train_id)), fbr_list); wTx = (scutfbr_w'*rcv1_x(:, perm(valid_id)))'; % +b for i = 1:size(fbr_list, 2) F = fmeasure(validy, 2*(wTx > -scutfbr_b_list(i))-1); f_list(i) = f_list(i) + F; end end best_fbr = fbr_list(find(f_list == max(f_list), 1, 'last')); if max(f_list) == 0 best_fbr = min(fbr_list); fprintf(1, 'INFO: train_one_label: F all 0\n'); end % final model [w, b] = scutfbr(proby, rcv1_x, best_fbr); fprintf(1, 'INFO: train_one_label: best_fbr %.1f\n', best_fbr); % % scutfbr.1 % function [w, b_list] = scutfbr(proby, probx, fbr_list) b_list = zeros(size(fbr_list, 2)); nr_fold = 3; l = size(proby, 1); perm = randperm(l)'; % cv for fold = 1:nr_fold train_id = [1:floor((fold-1)*l/nr_fold) floor(fold*l/nr_fold)+1:l]'; valid_id = [floor((fold-1)*l/nr_fold)+1:floor(fold*l/nr_fold)]'; y = proby(perm(train_id)); validy = proby(perm(valid_id)); % scut [scut_w, scut_b] = do_train(y, probx(:, perm(train_id))); wTx = (scut_w'*probx(:, perm(valid_id)))'; start_F = fmeasure(validy, 2*(wTx > -scut_b)-1); [sorted_wTx, sorted_wTx_index] = sort(wTx, 1, 'ascend'); tp = sum(validy == 1); fp = size(validy,1)-tp; fn = 0; cut = -1; best_F = (2*tp) / (2*tp + fp + fn); for i = 1:size(validy, 1) if validy(sorted_wTx_index(i)) == -1, fp = fp -1; else tp = tp -1; fn = fn + 1; end F = (2*tp) / (2*tp + fp + fn); if F >= best_F best_F = F; cut = i; end end % modify b if best_F > start_F if cut == -1 % i.e., all +1 scut_b = - (sorted_wTx(1)-eps); % predict all +1 elseif cut == size(validy, 1) scut_b = - (sorted_wTx(size(validy, 1))+eps); else scut_b = - (sorted_wTx(cut) + sorted_wTx(cut+1))/2; end end F = fmeasure(validy, 2*(wTx > -scut_b)-1); % compare to fbr_list for i = 1:size(fbr_list, 2) if F > fbr_list(i) b_list(i) = b_list(i) + scut_b; else b_list(i) = b_list(i) - max(wTx); end end end % final model b_list = b_list / nr_fold; [w, junk] = do_train(proby, probx); % % train % function [w, b] = do_train(y, x); global algo switch algo case 'lr' cmd = ['-s 0 -c 20 -B -1 -e 0.5']; case 'l2svm_dual' cmd = ['-s 1 -c 1 -B -1 -e 0.5 ']; case 'l2svm' cmd = ['-s 2 -c 1 -B -1 -e 0.5 ']; case 'l1svm_dual' cmd = ['-s 3 -c 1 -B -1 -e 0.5 ']; otherwise disp('INFO: Unkown option') end model = train(y, x, cmd, 'col'); w=model.w'; % n by nr_class b=0; if model.Label(1) == -1 w = -w; b = -b; end % % Calculate F_{1.0) measure for two-class problem % classes are +1, -1 % function F = fmeasure(original, predict) F = 0; tp = full(sum(original == +1 & predict == +1)); fn = full(sum(original == +1 & predict == -1)); fp = full(sum(original == -1 & predict == +1)); if tp ~= 0 || fp ~= 0 || fn ~= 0 F = (2*tp) / (2*tp + fp + fn); end