1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#[allow(unused_import_braces)]
pub use self::sgd::{Momentum};
pub mod sgd;
use co::{IBackend, MemoryType, SharedTensor};
use conn::NN;
use solver::*;
use layer::*;
use util::*;
/// Common functionality shared by Stochastic-Gradient-Descent-based solvers
/// (plain SGD with momentum, and future variants).
///
/// Implementors only need to provide `compute_update_value`; gradient
/// clipping, minibatch normalization and regularization come for free.
trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32>> : ISolver<SolverB, NetB> {
    /// Compute the update value for one weight blob and store it in the
    /// solver's history entry `history_blob_id`.
    ///
    /// The effective learning rate for the blob is `global_lr * blob_lr`.
    fn compute_update_value(&mut self,
                            config: &SolverConfig,
                            weight_blob: &ArcLock<SharedTensor<f32>>,
                            history_blob_id: usize,
                            global_lr: &f32,
                            blob_lr: &f32);

    /// Scale down all learnable weight gradients when their combined L2 norm
    /// exceeds `config.clip_gradients`, to guard against exploding gradients.
    ///
    /// [Gradient norm clipping][1] is a technique used when dealing with
    /// exploding gradients in deep networks; see the config for the threshold.
    ///
    /// [1]: http://arxiv.org/abs/1211.5063
    #[allow(unused_must_use)]
    fn clip_gradients<B: IBackend + LayerOps<f32> + 'static>(&self, config: &SolverConfig, net: &mut Layer<B>) {
        // skip clipping gradients if no threshold is configured
        if let Some(clip_threshold) = config.clip_gradients {
            let native = native_backend();

            let net_gradients = net.learnable_weights_gradients();
            let mut sumsq_diff = 0f32;
            let backend = self.backend();
            // Accumulate the squared L2 norm over all gradient blobs.
            for net_gradient in net_gradients.clone() {
                let gradient = net_gradient.read().unwrap();
                let mut result = SharedTensor::<f32>::new(IBackend::device(backend), &1).unwrap();
                // result <- gradient . gradient (squared norm of this blob)
                self.backend().dot_plain(&gradient, &gradient, &mut result);
                // BUGFIX: `result` was previously shadowed here by a second,
                // freshly-allocated tensor, discarding the dot product and
                // reading uninitialized memory below.
                match result.add_device(native.device()) { _ => result.sync(native.device()).unwrap() }
                match result.get(native.device()).unwrap() {
                    &MemoryType::Native(ref sumsq_result) => {
                        let sumsq_diff_slice = sumsq_result.as_slice::<f32>();
                        sumsq_diff += sumsq_diff_slice[0];
                    },
                    #[cfg(any(feature = "opencl", feature = "cuda"))]
                    _ => {}
                }
            }
            let l2norm_diff = sumsq_diff.sqrt();
            if l2norm_diff > clip_threshold {
                // Uniformly rescale every gradient so the global norm equals
                // the configured threshold.
                let scale_factor = clip_threshold / l2norm_diff;
                info!("Gradient clipping: scaling down gradients (L2 norm {} > {})
                        by scale factor {}",
                      l2norm_diff,
                      clip_threshold,
                      scale_factor);
                let mut scale_shared = native_scalar(scale_factor);
                for weight_gradient in net_gradients {
                    let mut gradient = weight_gradient.write().unwrap();
                    backend.scal(&mut scale_shared, &mut gradient);
                }
            }
        }
    }

    /// Scale the gradient of a weight blob by `1 / minibatch_size` so the
    /// update represents the average over the minibatch rather than the sum.
    ///
    /// No-op for a minibatch size of 1 (the gradient already is the average).
    fn normalize(&self, config: &SolverConfig, weight_blob: &ArcLock<SharedTensor<f32>>) {
        if config.minibatch_size > 1 {
            let scale_factor = 1f32 / config.minibatch_size as f32;
            let mut gradient = weight_blob.write().unwrap();
            let mut scale_factor_shared = native_scalar(scale_factor);
            self.backend().scal(&mut scale_factor_shared, &mut gradient).unwrap();
        }
    }

    /// Apply the configured regularization (weight decay) to a weight
    /// gradient.
    ///
    /// The effective decay is `config.weight_decay * blob_weight_decay`;
    /// nothing happens unless both a global weight decay and a
    /// regularization method are configured. Logs an error when the
    /// per-blob decay multiplier is missing.
    fn regularize(&self, config: &SolverConfig, weight_gradient: &ArcLock<SharedTensor<f32>>, blob_weight_decay: Option<f32>) {
        if let Some(global_weight_decay) = config.weight_decay {
            if let Some(regularization_method) = config.regularization_method {
                match blob_weight_decay {
                    Some(weight_decay_mult) => {
                        let local_decay = global_weight_decay * weight_decay_mult;
                        match regularization_method {
                            RegularizationMethod::L2 => {
                                // L2 regularization: gradient += local_decay * weight.
                                // The scalar and the gradient lock are prepared here
                                // for the pending axpy implementation.
                                let decay_shared = native_scalar(local_decay);
                                let gradient = &mut weight_gradient.write().unwrap();
                                // TODO: needs either a SolverOps axpy or access to
                                // the weight data to complete the L2 penalty.
                                unimplemented!();
                            }
                        }
                    }
                    None => {
                        error!("Weight decay multiplier for gradient missing.");
                    }
                }
            }
        }
    }
}