TMB Documentation  v1.9.11
start_parallel.hpp
1 // Copyright (C) 2013-2015 Kasper Kristensen
2 // License: GPL-2
3 
4 /*
5  ================================================
6  Routines depending on the openmp runtime library
7  ================================================
8 */
9 #ifdef _OPENMP
10 #ifdef WITH_LIBTMB
11 bool in_parallel();
12 size_t thread_num();
13 void start_parallel();
14 #else
15 bool in_parallel(){
16  return static_cast<bool>(omp_in_parallel());
17 }
18 size_t thread_num(){
19  return static_cast<size_t>(omp_get_thread_num());
20 }
21 void start_parallel(){
22 #ifdef CPPAD_FRAMEWORK
23  CppAD::thread_alloc::free_all();
24 #endif // CPPAD_FRAMEWORK
25  int nthreads=config.nthreads;
26  if(config.trace.parallel)
27  std::cout << "Using " << nthreads << " threads\n";
28 #ifdef CPPAD_FRAMEWORK
29  CppAD::thread_alloc::parallel_setup(nthreads,in_parallel,thread_num);
30  CppAD::parallel_ad<AD<AD<AD<double> > > >();
31  CppAD::parallel_ad<AD<AD<double> > >();
32  CppAD::parallel_ad<AD<double> >();
33  CppAD::parallel_ad<double >();
34 #endif // CPPAD_FRAMEWORK
35 }
36 #endif
37 #endif
38 
39 
40 /*
41  ================================================
42  Templates to do parallel computations
43  ================================================
44 */
45 template<class ADFunType>
46 struct sphess_t{
47  sphess_t(ADFunType* pf_,vector<int> i_,vector<int> j_){pf=pf_;i=i_;j=j_;}
48  ADFunType* pf;
49  vector<int> i;
50  vector<int> j;
51 };
52 
53 #ifdef CPPAD_FRAMEWORK
54 #define ADFUN ADFun<double>
55 #endif // CPPAD_FRAMEWORK
56 #ifdef TMBAD_FRAMEWORK
57 #define ADFUN TMBad::ADFun<TMBad::ad_aug>
58 #endif // TMBAD_FRAMEWORK
59 
61 typedef sphess_t<ADFUN > sphess;
62 
63 /*
64  Suppose we have a mapping F:R^n->R^m which may be written as F=F1+...+Fk.
65  Suppose we have tapes Fi:R^n->R^mi representing Fi with identical domain
66  but with *reduced range dimension* (because some range components of Fi
67  does not depend on any of the domain variables).
68  Based on these tape chunks construct an object behaving just like the
69  corresponding full taped version of F.
70  */
71 template <class Type>
72 struct parallelADFun : ADFUN { /* Inheritance just so that compiler wont complain about missing members */
73  typedef ADFUN Base;
74  /* Following five members must be defined by constructor.
75  Outer vectors are indexed by the chunk number.
76  E.g. for tape number i vecind[i] is a vector of numbers in the
77  interval [0,1,...,range-1] telling how to embed this tapes range
78  in the full range.
79  */
80  int ntapes;
81  vector<Base*> vecpf;
82  vector<vector<size_t> > vecind;
83  size_t domain;
84  size_t range;
85  /* Following members are optional */
87  /* row and column indices */
88  vector<int> veci;
89  vector<int> vecj;
90  /* Constructor:
91  In the case of a vector of ADFun pointers we assume that
92  they all have equal domain and range dimensions.
93  */
94  void CTOR(vector<Base*> vecpf_) {
95  size_t n=vecpf_.size();
96  ntapes=n;
97  vecpf.resize(n);
98  for(size_t i=0;i<n;i++)vecpf[i]=vecpf_[i];
99  domain=vecpf[0]->Domain();
100  range=vecpf[0]->Range();
101  vecind.resize(n);
102  for(size_t i=0;i<n;i++){
103  vecind(i).resize(range);
104  for(size_t j=0;j<range;j++){
105  vecind(i)[j]=j;
106  }
107  }
108  }
109  parallelADFun(vector<Base*> vecpf_) {
110  CTOR(vecpf_);
111  }
112  parallelADFun(const std::vector<Base> &vecf) {
113  vector<Base*> vecpf(vecf.size());
114 #ifdef _OPENMP
115 #pragma omp parallel for num_threads(config.nthreads)
116 #endif
117  for (int i=0; i<vecpf.size(); i++) {
118  vecpf[i] = new Base(vecf[i]);
119  }
120  CTOR(vecpf);
121  }
122  /* Constructor:
123  In the case of a vector of sphess pointers the range dimensions are allowed
124  to differ. Based on sparseness pattern represented by each tape we must
125  compute "vecind" (for each tape), formally by matching the individual
126  sparseness patterns in a numbering of the full sparseness pattern (the union).
127  */
128  parallelADFun(vector<sphess* > H){
129  H_=H;
130  domain=H[0]->pf->Domain();
131  int n=H.size();
132  ntapes=n;
133  vecpf.resize(n);
134  vecind.resize(n);
135  for(int i=0;i<n;i++)vecpf[i]=H[i]->pf;
136  size_t kmax=0;
137  for(int i=0;i<n;i++){
138  //vecind[i]=(vector<size_t>(H[i]->i))+(vector<size_t>(H[i]->j))*domain;
139  vecind[i]=((H[i]->i).cast<size_t>())+((H[i]->j).cast<size_t>())*domain;
140  kmax+=vecind[i].size();
141  }
142  veci.resize(kmax);vecj.resize(kmax);
143  vector<int> pos(n); /* keep track of positions in individual index vectors */
144  for(int i=0;i<n;i++){pos(i)=0;};
145  if(config.trace.parallel) std::cout << "Hessian number of non-zeros:\n";
146  for(int i=0;i<n;i++){
147  if(config.trace.parallel) std::cout << "nnz = " << vecind(i).size() << "\n";
148  };
149  vector<size_t> value(n); /* value corresponding to pos */
150  int k=0; /* Incremented for each unique value */
151  size_t m; /* Hold current minimum value */
152  size_t inf=-1; /* size_t is unsigned - so -1 should give the largest possible size_t... */
153  int rowk=-1,colk=-1; /* -Wall */
154  while(true){
155  for(int i=0;i<n;i++){if(pos(i)<vecind(i).size())value(i)=vecind(i)[pos(i)];else value(i)=inf;}
156  m=value(0);
157  for(int i=0;i<n;i++){if(value(i)<m)m=value(i);}
158  if(m==inf)break;
159  for(int i=0;i<n;i++){
160  if(value(i)==m){
161  vecind(i)[pos(i)]=k;
162  rowk=(H[i]->i)[pos(i)];
163  colk=(H[i]->j)[pos(i)];
164  pos(i)++;
165  }
166  }
167  veci[k]=rowk;
168  vecj[k]=colk;
169  k++;
170  }
171  range=k;
172  //veci.resize(k);vecj.resize(k);
173  veci.conservativeResize(k);vecj.conservativeResize(k);
174  };
175  /* Destructor */
176  ~parallelADFun(){
177  if(config.trace.parallel) std::cout << "Free parallelADFun object.\n";
178  for(int i=0;i<vecpf.size();i++){
179  delete vecpf[i];
180  }
181  }
182  /* Convenience: convert this object to sphess like object */
183  sphess_t<parallelADFun<double> > convert(){
184  sphess_t<parallelADFun<double> > ans(this,veci,vecj);
185  return ans;
186  }
187  /* Subset of vector x to indices of tape number "tapeid" */
188  template <typename VectorBase>
189  VectorBase subset(const VectorBase& x, size_t tapeid, int p=1){
190  VectorBase y;
191  y.resize(vecind(tapeid).size()*p);
192  for(int i=0;i<(int)y.size()/p;i++)
193  for(int j=0;j<p;j++)
194  {y[i*p+j]=x[vecind(tapeid)[i]*p+j];}
195  return y;
196  }
197  /* Inverse operation of the subset above */
198  template <typename VectorBase>
199  void addinsert(VectorBase& x, const VectorBase& y, size_t tapeid, int p=1){
200  for(int i=0;i<(int)y.size()/p;i++)
201  for(int j=0;j<p;j++)
202  {x[vecind(tapeid)[i]*p+j]+=y[i*p+j];}
203  }
204 
205  /* Overload methods */
206  size_t Domain(){return domain;}
207  size_t Range(){return range;}
208 
209 #ifdef CPPAD_FRAMEWORK
210  /* p=order, p+1 taylorcoefficients per variable, x=domain vector
211  x contains p'th order taylor coefficients of input (length n).
212  Output contains (p+1)'th order taylor coefficients (length m).
213  =====> output = vector of length m (m=range dim)
214  */
215  template <typename VectorBase>
216  VectorBase Forward(size_t p, const VectorBase& x, std::ostream& s = std::cout){
217  vector<VectorBase> ans(ntapes);
218 #ifdef _OPENMP
219 #pragma omp parallel for num_threads(config.nthreads)
220 #endif
221  for(int i=0;i<ntapes;i++)ans(i) = vecpf(i)->Forward(p,x);
222  VectorBase out(range);
223  for(size_t i=0;i<range;i++)out(i)=0;
224  for(int i=0;i<ntapes;i++)addinsert(out,ans(i),i);
225  return out;
226  }
227  /* p=number of taylor coefs per variable (fastest running in output vector).
228  v=rangeweight vector. Can be either of length m or m*p. (m=range dim)
229  output=vector of length p*n (n=domain dim).
230  */
231  template <typename VectorBase>
232  VectorBase Reverse(size_t p, const VectorBase &v){
233  vector<VectorBase> ans(ntapes);
234 #ifdef _OPENMP
235 #pragma omp parallel for num_threads(config.nthreads)
236 #endif
237  for(int i=0;i<ntapes;i++)ans(i) = vecpf(i)->Reverse(p,subset(v,i));
238  VectorBase out(p*domain);
239  for(size_t i=0;i<p*domain;i++)out(i)=0;
240  for(int i=0;i<ntapes;i++)out=out+ans(i);
241  return out;
242  }
243  template <typename VectorBase>
244  VectorBase Jacobian(const VectorBase &x){
245  vector<VectorBase> ans(ntapes);
246 #ifdef _OPENMP
247 #pragma omp parallel for num_threads(config.nthreads)
248 #endif
249  for(int i=0;i<ntapes;i++)ans(i) = vecpf(i)->Jacobian(x);
250  VectorBase out( domain * range ); // domain fastest running
251  out.setZero();
252  for(int i=0;i<ntapes;i++)addinsert(out,ans(i),i,domain);
253  return out;
254  }
255  template <typename VectorBase>
256  VectorBase Hessian(const VectorBase &x, size_t rangecomponent){
257  vector<VectorBase> ans(ntapes);
258 #ifdef _OPENMP
259 #pragma omp parallel for num_threads(config.nthreads)
260 #endif
261  for(int i=0;i<ntapes;i++)ans(i) = vecpf(i)->Hessian(x,rangecomponent);
262  VectorBase out( domain * domain );
263  out.setZero();
264  for(int i=0;i<ntapes;i++)addinsert(out,ans(i),i,domain*domain);
265  return out;
266  }
267  /* optimize ADFun object */
268  void optimize(){
269  if(config.trace.optimize)std::cout << "Optimizing parallel tape... ";
270 #ifdef _OPENMP
271 #pragma omp parallel for num_threads(config.nthreads) if (config.optimize.parallel)
272 #endif
273  for(int i=0;i<ntapes;i++)vecpf(i)->optimize();
274  if(config.trace.optimize)std::cout << "Done\n";
275  }
276 #endif // CPPAD_FRAMEWORK
277 
278 #ifdef TMBAD_FRAMEWORK
279  void unset_tail() {
280  for(int i=0; i<ntapes; i++) vecpf(i) -> unset_tail();
281  }
282  void set_tail(const std::vector<TMBad::Index> &r) {
283 #ifdef _OPENMP
284 #pragma omp parallel for num_threads(config.nthreads)
285 #endif
286  for(int i=0; i<ntapes; i++) vecpf(i) -> set_tail(r);
287  }
288  void force_update() {
289 #ifdef _OPENMP
290 #pragma omp parallel for num_threads(config.nthreads)
291 #endif
292  for(int i=0; i<ntapes; i++) vecpf(i) -> force_update();
293  }
294  vector<double> operator()(const std::vector<double> &x) {
295  vector<vector<double> > ans(ntapes);
296 #ifdef _OPENMP
297 #pragma omp parallel for num_threads(config.nthreads)
298 #endif
299  for(int i=0; i<ntapes; i++)
300  ans(i) = vector<double>(vecpf(i)->operator()(x));
301  vector<double> out(range);
302  out.setZero();
303  for(int i=0; i<ntapes; i++) addinsert(out, ans(i), i);
304  return out;
305  }
306  vector<double> Jacobian(const std::vector<double> &x) {
307  vector<vector<double> > ans(ntapes);
308 #ifdef _OPENMP
309 #pragma omp parallel for num_threads(config.nthreads)
310 #endif
311  for(int i=0; i<ntapes; i++)
312  ans(i) = vector<double>(vecpf(i)->Jacobian(x));
313  vector<double> out( domain * range ); // domain fastest running
314  out.setZero();
315  for(int i=0; i<ntapes; i++) addinsert(out, ans(i), i, domain);
316  return out;
317  }
318  vector<double> Jacobian(const std::vector<double> &x,
319  const std::vector<bool> &keep_x,
320  const std::vector<bool> &keep_y ) {
321  vector<vector<double> > ans(ntapes);
322 #ifdef _OPENMP
323 #pragma omp parallel for num_threads(config.nthreads)
324 #endif
325  for(int i=0; i<ntapes; i++)
326  ans(i) = vector<double>(vecpf(i)->Jacobian(x, keep_x, subset(keep_y, i)));
327  // Calculate indices into row space of resulting jacobian (subset)
328  vector<vector<size_t> > vecind2(vecind.size());
329  std::vector<size_t> remap = TMBad::cumsum0<size_t> (keep_y);
330  for(int i=0; i<ntapes; i++) {
331  std::vector<bool>
332  sub = subset(keep_y, i); // Bool mask into vecind(i)
333  std::vector<size_t>
334  vecind_i(vecind(i));
335  std::vector<size_t>
336  vecind_i_sub = TMBad::subset(vecind_i, sub); // remaining vecind(i)
337  std::vector<size_t>
338  vecind_i_sub_remap = TMBad::subset(remap, vecind_i_sub); // Remap
339  vecind2(i) = vector<size_t> (vecind_i_sub_remap);
340  }
341  // Fill into result matrix
342  int dim_x = std::count(keep_x.begin(), keep_x.end(), true);
343  int dim_y = std::count(keep_y.begin(), keep_y.end(), true);
344  vector<double> out( dim_x * dim_y );
345  out.setZero();
346  std::swap(vecind, vecind2);
347  for (int i=0; i<ntapes; i++) addinsert(out, ans(i), i, dim_x);
348  std::swap(vecind, vecind2);
349  return out;
350  }
351  vector<double> Jacobian(const std::vector<double> &x,
352  const vector<double> &w) {
353  vector<vector<double> > ans(ntapes);
354 #ifdef _OPENMP
355 #pragma omp parallel for num_threads(config.nthreads)
356 #endif
357  for(int i=0; i<ntapes; i++)
358  ans(i) = vector<double>(vecpf(i)->Jacobian(x, subset(w, i)));
359  vector<double> out(domain);
360  out.setZero();
361  for(int i=0; i<ntapes; i++) out = out + ans(i);
362  return out;
363  }
364  // Used by tmbstan (tmb_forward and tmb_reverse)
365  // Assuming one-dimensional output.
366  template<class Vector>
367  Vector forward(const Vector &x) {
368  vector<Vector> ans(ntapes);
369 #ifdef _OPENMP
370 #pragma omp parallel for num_threads(config.nthreads)
371 #endif
372  for(int i=0; i<ntapes; i++) ans(i) = vecpf(i)->forward(x);
373  Vector out(1);
374  out.setZero();
375  for(int i=0; i<ntapes; i++) out = out + ans(i);
376  return out;
377  }
378  template<class Vector>
379  Vector reverse(const Vector &w) {
380  vector<Vector> ans(ntapes);
381 #ifdef _OPENMP
382 #pragma omp parallel for num_threads(config.nthreads)
383 #endif
384  for(int i=0; i<ntapes; i++) ans(i) = vecpf(i)->reverse(w);
385  Vector out(domain);
386  out.setZero();
387  for(int i=0; i<ntapes; i++) out = out + ans(i);
388  return out;
389  }
390 #endif // TMBAD_FRAMEWORK
391 };
392 
std::vector< T > subset(const std::vector< T > &x, const std::vector< bool > &y)
Vector subset by boolean mask.
Vector class used by TMB.
Definition: vector.hpp:17
bool optimize
Trace tape optimization.
Definition: config.hpp:30
int nthreads
Number of OpenMP threads to use (if OpenMP is enabled)
Definition: config.hpp:50
bool parallel
Trace info from parallel for loops.
Definition: config.hpp:29
License: GPL v2