1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/usr/bin/python
# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
#
# This is an example illustrating the use of the structural SVM solver from the dlib C++
# Library. Therefore, this example teaches you the central ideas needed to setup a
# structural SVM model for your machine learning problems. To illustrate the process, we
# use dlib's structural SVM solver to learn the parameters of a simple multi-class
# classifier. We first discuss the multi-class classifier model and then walk through
# using the structural SVM tools to find the parameters of this classification model.
#
# As an aside, dlib's C++ interface to the structural SVM solver is threaded. So on a
# multi-core computer it is significantly faster than using the python interface. So
# consider using the C++ interface instead if you find that running it in python is slow.
#
# COMPILING THE DLIB PYTHON INTERFACE
# Dlib comes with a compiled python interface for python 2.7 on MS Windows. If
# you are using another python version or operating system then you need to
# compile the dlib python interface before you can use this file. To do this,
# run compile_dlib_python_module.bat. This should work on any operating system
# so long as you have CMake and boost-python installed. On Ubuntu, this can be
# done easily by running the command: sudo apt-get install libboost-python-dev cmake
import dlib
def main():
# In this example, we have three types of samples: class 0, 1, or 2. That is, each of
# our sample vectors falls into one of three classes. To keep this example very
# simple, each sample vector is zero everywhere except at one place. The non-zero
# dimension of each vector determines the class of the vector. So for example, the
# first element of samples has a class of 1 because samples[0][1] is the only non-zero
# element of samples[0].
samples = [[0,2,0], [1,0,0], [0,4,0], [0,0,3]];
# Since we want to use a machine learning method to learn a 3-class classifier we need
# to record the labels of our samples. Here samples[i] has a class label of labels[i].
labels = [1,0,1,2]
# Now that we have some training data we can tell the structural SVM to learn the
# parameters of our 3-class classifier model. The details of this will be explained
# later. For now, just note that it finds the weights (i.e. a vector of real valued
# parameters) such that predict_label(weights, sample) always returns the correct label
# for a sample vector.
problem = three_class_classifier_problem(samples, labels)
weights = dlib.solve_structural_svm_problem(problem)
# Print the weights and then evaluate predict_label() on each of our training samples.
# Note that the correct label is predicted for each sample.
print weights
for i in range(len(samples)):
print "predicted label for sample[{0}]: {1}".format(i, predict_label(weights, samples[i]))
def predict_label(weights, sample):
"""Given the 9-dimensional weight vector which defines a 3 class classifier, predict the
class of the given 3-dimensional sample vector. Therefore, the output of this
function is either 0, 1, or 2 (i.e. one of the three possible labels)."""
# Our 3-class classifier model can be thought of as containing 3 separate linear
# classifiers. So to predict the class of a sample vector we evaluate each of these
# three classifiers and then whatever classifier has the largest output "wins" and
# predicts the label of the sample. This is the popular one-vs-all multi-class
# classifier model.
#
# Keeping this in mind, the code below simply pulls the three separate weight vectors
# out of weights and then evaluates each against sample. The individual classifier
# scores are stored in scores and the highest scoring index is returned as the label.
w0 = weights[0:3]
w1 = weights[3:6]
w2 = weights[6:9]
scores = [dot(w0, sample), dot(w1,sample), dot(w2, sample)]
max_scoring_label = scores.index(max(scores))
return max_scoring_label
def dot(a, b):
"Compute the dot product between the two vectors a and b."
return sum(i*j for i,j in zip(a,b))
###########################################################################################
class three_class_classifier_problem:
# Now we arrive at the meat of this example program. To use the
# dlib.solve_structural_svm_problem() routine you need to define an object which tells
# the structural SVM solver what to do for your problem. In this example, this is done
# by defining the three_class_classifier_problem object. Before we get into the
# details, we first discuss some background information on structural SVMs.
#
# A structural SVM is a supervised machine learning method for learning to predict
# complex outputs. This is contrasted with a binary classifier which makes only simple
# yes/no predictions. A structural SVM, on the other hand, can learn to predict
# complex outputs such as entire parse trees or DNA sequence alignments. To do this,
# it learns a function F(x,y) which measures how well a particular data sample x
# matches a label y, where a label is potentially a complex thing like a parse tree.
# However, to keep this example program simple we use only a 3 category label output.
#
# At test time, the best label for a new x is given by the y which maximizes F(x,y).
# To put this into the context of the current example, F(x,y) computes the score for a
# given sample and class label. The predicted class label is therefore whatever value
# of y which makes F(x,y) the biggest. This is exactly what predict_label() does.
# That is, it computes F(x,0), F(x,1), and F(x,2) and then reports which label has the
# biggest value.
#
# At a high level, a structural SVM can be thought of as searching the parameter space
# of F(x,y) for the set of parameters that make the following inequality true as often
# as possible:
# F(x_i,y_i) > max{over all incorrect labels of x_i} F(x_i, y_incorrect)
# That is, it seeks to find the parameter vector such that F(x,y) always gives the
# highest score to the correct output. To define the structural SVM optimization
# problem precisely, we first introduce some notation:
# - let PSI(x,y) == the joint feature vector for input x and a label y.
# - let F(x,y|w) == dot(w,PSI(x,y)).
# (we use the | notation to emphasize that F() has the parameter vector of
# weights called w)
# - let LOSS(idx,y) == the loss incurred for predicting that the idx-th training
# sample has a label of y. Note that LOSS() should always be >= 0 and should
# become exactly 0 when y is the correct label for the idx-th sample. Moreover,
# it should notionally indicate how bad it is to predict y for the idx'th sample.
# - let x_i == the i-th training sample.
# - let y_i == the correct label for the i-th training sample.
# - The number of data samples is N.
#
# Then the optimization problem solved by a structural SVM using
# dlib.solve_structural_svm_problem() is the following:
# Minimize: h(w) == 0.5*dot(w,w) + C*R(w)
#
# Where R(w) == sum from i=1 to N: 1/N * sample_risk(i,w)
# and sample_risk(i,w) == max over all Y: LOSS(i,Y) + F(x_i,Y|w) - F(x_i,y_i|w)
# and C > 0
#
# You can think of the sample_risk(i,w) as measuring the degree of error you would make
# when predicting the label of the i-th sample using parameters w. That is, it is zero
# only when the correct label would be predicted and grows larger the more "wrong" the
# predicted output becomes. Therefore, the objective function is minimizing a balance
# between making the weights small (typically this reduces overfitting) and fitting the
# training data. The degree to which you try to fit the data is controlled by the C
# parameter.
#
# For a more detailed introduction to structured support vector machines you should
# consult the following paper:
# Predicting Structured Objects with Support Vector Machines by
# Thorsten Joachims, Thomas Hofmann, Yisong Yue, and Chun-nam Yu
#
# Finally, we come back to the code. To use dlib.solve_structural_svm_problem() you
# need to provide the things discussed above. This is the value of C, the number of
# training samples, the dimensionality of PSI(), as well as methods for calculating the
# loss values and PSI() vectors. You will also need to write code that can compute:
# max over all Y: LOSS(i,Y) + F(x_i,Y|w). To summarize, the
# three_class_classifier_problem class is required to have the following fields:
# - C
# - num_samples
# - num_dimensions
# - get_truth_joint_feature_vector()
# - separation_oracle()
C = 1
# There are also a number of optional arguments:
# epsilon is the stopping tolerance. The optimizer will run until R(w) is within
# epsilon of its optimal value. If you don't set this then it defaults to 0.001.
#epsilon = 1e-13
# Uncomment this and the optimizer will print its progress to standard out. You will
# be able to see things like the current risk gap. The optimizer continues until the
# risk gap is below epsilon.
#be_verbose = True
# If you want to require that the learned weights are all non-negative then set this
# field to True.
#learns_nonnegative_weights = True
# The optimizer uses an internal cache to avoid unnecessary calls to your
# separation_oracle() routine. This parameter controls the size of that cache. Bigger
# values use more RAM and might make the optimizer run faster. You can also disable it
# by setting it to 0 which is good to do when your separation_oracle is very fast. If
# If you don't call this function it defaults to a value of 5.
#max_cache_size = 20
def __init__(self, samples, labels):
# dlib.solve_structural_svm_problem() expects the class to have num_samples and
# num_dimensions fields. These fields should contain the number of training
# samples and the dimensionality of the PSI feature vector respectively.
self.num_samples = len(samples)
self.num_dimensions = len(samples[0])*3
self.samples = samples
self.labels = labels
def make_psi(self, x, label):
"""Compute PSI(x,label)."""
# All we are doing here is taking x, which is a 3 dimensional sample vector in this
# example program, and putting it into one of 3 places in a 9 dimensional PSI
# vector, which we then return. So this function returns PSI(x,label). To see why
# we setup PSI like this, recall how predict_label() works. It takes in a 9
# dimensional weight vector and breaks the vector into 3 pieces. Each piece then
# defines a different classifier and we use them in a one-vs-all manner to predict
# the label. So now that we are in the structural SVM code we have to define the
# PSI vector to correspond to this usage. That is, we need to setup PSI so that
# argmax_y dot(weights,PSI(x,y)) == predict_label(weights,x). This is how we tell
# the structural SVM solver what kind of problem we are trying to solve.
#
# It's worth emphasizing that the single biggest step in using a structural SVM is
# deciding how you want to represent PSI(x,label). It is always a vector, but
# deciding what to put into it to solve your problem is often not a trivial task.
# Part of the difficulty is that you need an efficient method for finding the label
# that makes dot(w,PSI(x,label)) the biggest. Sometimes this is easy, but often
# finding the max scoring label turns into a difficult combinatorial optimization
# problem. So you need to pick a PSI that doesn't make the label maximization step
# intractable but also still well models your problem.
# Create a dense vector object (note that you can also use unsorted sparse vectors
# (i.e. dlib.sparse_vector objects) to represent your PSI vector. This is useful
# if you have very high dimensional PSI vectors that are mostly zeros. In the
# context of this example, you would simply return a dlib.sparse_vector at the end
# of make_psi() and the rest of the example would still work properly. ).
psi = dlib.vector()
# Set it to have 9 dimensions. Note that the elements of the vector are 0
# initialized.
psi.resize(self.num_dimensions)
dims = len(x)
if (label == 0):
for i in range(0,dims):
psi[i] = x[i]
elif (label == 1):
for i in range(dims,2*dims):
psi[i] = x[i-dims]
else: # the label must be 2
for i in range(2*dims,3*dims):
psi[i] = x[i-2*dims]
return psi
# Now we get to the two member functions that are directly called by
# dlib.solve_structural_svm_problem().
#
# In get_truth_joint_feature_vector(), all you have to do is return the PSI() vector
# for the idx-th training sample when it has its true label. So here it returns
# PSI(self.samples[idx], self.labels[idx]).
def get_truth_joint_feature_vector(self, idx):
return self.make_psi(self.samples[idx], self.labels[idx])
# separation_oracle() is more interesting. dlib.solve_structural_svm_problem() will
# call separation_oracle() many times during the optimization. Each time it will give
# it the current value of the parameter weights and the separation_oracle() is supposed
# to find the label that most violates the structural SVM objective function for the
# idx-th sample. Then the separation oracle reports the corresponding PSI vector and
# loss value. To state this more precisely, the separation_oracle() member function
# has the following contract:
# requires
# - 0 <= idx < self.num_samples
# - len(current_solution) == self.num_dimensions
# ensures
# - runs the separation oracle on the idx-th sample. We define this as follows:
# - let X == the idx-th training sample.
# - let PSI(X,y) == the joint feature vector for input X and an arbitrary label y.
# - let F(X,y) == dot(current_solution,PSI(X,y)).
# - let LOSS(idx,y) == the loss incurred for predicting that the idx-th sample
# has a label of y. Note that LOSS() should always be >= 0 and should
# become exactly 0 when y is the correct label for the idx-th sample.
#
# Then the separation oracle finds a Y such that:
# Y = argmax over all y: LOSS(idx,y) + F(X,y)
# (i.e. It finds the label which maximizes the above expression.)
#
# Finally, separation_oracle() returns LOSS(idx,Y),PSI(X,Y)
def separation_oracle(self, idx, current_solution):
samp = self.samples[idx]
dims = len(samp)
scores = [0,0,0]
# compute scores for each of the three classifiers
scores[0] = dot(current_solution[0:dims], samp)
scores[1] = dot(current_solution[dims:2*dims], samp)
scores[2] = dot(current_solution[2*dims:3*dims], samp)
# Add in the loss-augmentation. Recall that we maximize LOSS(idx,y) + F(X,y) in
# the separate oracle, not just F(X,y) as we normally would in predict_label().
# Therefore, we must add in this extra amount to account for the loss-augmentation.
# For our simple multi-class classifier, we incur a loss of 1 if we don't predict
# the correct label and a loss of 0 if we get the right label.
if (self.labels[idx] != 0):
scores[0] += 1
if (self.labels[idx] != 1):
scores[1] += 1
if (self.labels[idx] != 2):
scores[2] += 1
# Now figure out which classifier has the largest loss-augmented score.
max_scoring_label = scores.index(max(scores))
# And finally record the loss that was associated with that predicted label.
# Again, the loss is 1 if the label is incorrect and 0 otherwise.
if (max_scoring_label == self.labels[idx]):
loss = 0
else:
loss = 1
# Finally, return the loss and PSI vector corresponding to the label we just found.
psi = self.make_psi(samp, max_scoring_label)
return loss,psi
if __name__ == "__main__":
main()