/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_DNN_DNN_ALL_LAYERS_HPP
#define OPENCV_DNN_DNN_ALL_LAYERS_HPP
#include <opencv2/dnn.hpp>

namespace cv {
namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN
//! @addtogroup dnn
//! @{

/** @defgroup dnnLayerList Partial List of Implemented Layers
  @{
  This subsection of dnn module contains information about built-in layers and their descriptions.

  Classes listed here, in fact, provide C++ API for creating instances of built-in layers.
  In addition to this way of layers instantiation, there is a more common factory API (see @ref dnnLayerFactory), it allows to create layers dynamically (by name) and register new ones.
  You can use both APIs, but factory API is less convenient for native C++ programming and basically designed for use inside importers (see @ref readNetFromCaffe(), @ref readNetFromTorch(), @ref readNetFromTensorflow()).

  Built-in layers partially reproduce functionality of corresponding Caffe and Torch7 layers.
  In particular, the following layers and Caffe importer were tested to reproduce <a href="http://caffe.berkeleyvision.org/tutorial/layers.html">Caffe</a> functionality:
  - Convolution
  - Deconvolution
  - Pooling
  - InnerProduct
  - TanH, ReLU, Sigmoid, BNLL, Power, AbsVal
  - Softmax
  - Reshape, Flatten, Slice, Split
  - LRN
  - MVN
  - Dropout (since it does nothing on forward pass -))
*/

//! Identity layer: passes its input through unchanged.
class CV_EXPORTS BlankLayer : public Layer
{
public:
    static Ptr<Layer> create(const LayerParams &params);
};

/**
 * Constant layer produces the same data blob at every forward pass.
 */
class CV_EXPORTS ConstLayer : public Layer
{
public:
    static Ptr<Layer> create(const LayerParams &params);
};

//! LSTM recurrent layer
class CV_EXPORTS LSTMLayer : public Layer
{
public:
    /** Creates instance of LSTM layer */
    static Ptr<LSTMLayer> create(const LayerParams& params);

    /** @deprecated Use LayerParams::blobs instead.
    @brief Set trained weights for LSTM layer.

    LSTM behavior on each step is defined by current input, previous output, previous cell state and learned weights.

    Let @f$x_t@f$ be current input, @f$h_t@f$ be current output, @f$c_t@f$ be current state.
    Then current output and current cell state is computed as follows:
    @f{eqnarray*}{
    h_t &= o_t \odot tanh(c_t),               \\
    c_t &= f_t \odot c_{t-1} + i_t \odot g_t, \\
    @f}
    where @f$\odot@f$ is per-element multiply operation and @f$i_t, f_t, o_t, g_t@f$ is internal gates that are computed using learned weights.

    Gates are computed as follows:
    @f{eqnarray*}{
    i_t &= sigmoid&(W_{xi} x_t + W_{hi} h_{t-1} + b_i), \\
    f_t &= sigmoid&(W_{xf} x_t + W_{hf} h_{t-1} + b_f), \\
    o_t &= sigmoid&(W_{xo} x_t + W_{ho} h_{t-1} + b_o), \\
    g_t &= tanh   &(W_{xg} x_t + W_{hg} h_{t-1} + b_g), \\
    @f}
    where @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
    @f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.

    For simplicity and performance purposes we use @f$ W_x = [W_{xi}; W_{xf}; W_{xo}; W_{xg}] @f$
    (i.e. @f$W_x@f$ is vertical concatenation of @f$ W_{x?} @f$), @f$ W_x \in R^{4N_h \times N_x} @f$.
    The same for @f$ W_h = [W_{hi}; W_{hf}; W_{ho}; W_{hg}], W_h \in R^{4N_h \times N_h} @f$
    and for @f$ b = [b_i; b_f; b_o; b_g]@f$, @f$b \in R^{4N_h} @f$.

    @param Wh is matrix defining how previous output is transformed to internal gates (i.e. according to above mentioned notation is @f$ W_h @f$)
    @param Wx is matrix defining how current input is transformed to internal gates (i.e. according to above mentioned notation is @f$ W_x @f$)
    @param b  is bias vector (i.e. according to above mentioned notation is @f$ b @f$)
    */
    CV_DEPRECATED virtual void setWeights(const Mat &Wh, const Mat &Wx, const Mat &b) = 0;

    /** @brief Specifies shape of output blob which will be [[`T`], `N`] + @p outTailShape.
      * @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
      * where `Wh` is parameter from setWeights().
      */
    virtual void setOutShape(const MatShape &outTailShape = MatShape()) = 0;

    /** @deprecated Use flag `use_timestamp_dim` in LayerParams.
      * @brief Specifies whether the first dimension of input blob is interpreted as the timestamp dimension or as the sample dimension.
      *
      * If flag is set to true then shape of input blob will be interpreted as [`T`, `N`, `[data dims]`] where `T` specifies number of timestamps, `N` is number of independent streams.
      * In this case each forward() call will iterate through `T` timestamps and update layer's state `T` times.
      *
      * If flag is set to false then shape of input blob will be interpreted as [`N`, `[data dims]`].
      * In this case each forward() call will make one iteration and produce one timestamp with shape [`N`, `[out dims]`].
      */
    CV_DEPRECATED virtual void setUseTimstampsDim(bool use = true) = 0;

    /** @deprecated Use flag `produce_cell_output` in LayerParams.
      * @brief If this flag is set to true then layer will produce @f$ c_t @f$ as second output.
      * @details Shape of the second output is the same as first output.
      */
    CV_DEPRECATED virtual void setProduceCellOutput(bool produce = false) = 0;

    /* In common case it uses single input with @f$x_t@f$ values to compute output(s) @f$h_t@f$ (and @f$c_t@f$).
     * @param input should contain packed values @f$x_t@f$
     * @param output contains computed outputs: @f$h_t@f$ (and @f$c_t@f$ if setProduceCellOutput() flag was set to true).
     *
     * If setUseTimstampsDim() is set to true then @p input[0] should have at least two dimensions with the following shape: [`T`, `N`, `[data dims]`],
     * where `T` specifies number of timestamps, `N` is number of independent streams (i.e. @f$ x_{t_0 + t}^{stream} @f$ is stored inside @p input[0][t, stream, ...]).
     *
     * If setUseTimstampsDim() is set to false then @p input[0] should contain single timestamp, its shape should have form [`N`, `[data dims]`] with at least one dimension.
     * (i.e. @f$ x_{t}^{stream} @f$ is stored inside @p input[0][stream, ...]).
     */

    int inputNameToIndex(String inputName) CV_OVERRIDE;
    int outputNameToIndex(const String& outputName) CV_OVERRIDE;
};

/** @brief Classical recurrent layer

Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and compute two outputs @f$o_t@f$ and @f$h_t@f$.

- input: should contain packed input @f$x_t@f$.
- output: should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).

input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` is number of timestamps and number of independent samples of @f$x_t@f$ respectively.

output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is number of rows in @f$ W_{ho} @f$ matrix.

If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
*/
class CV_EXPORTS RNNLayer : public Layer
{
public:
    /** Creates instance of RNNLayer */
    static Ptr<RNNLayer> create(const LayerParams& params);

    /** Setups learned weights.

    Recurrent-layer behavior on each step is defined by current input @f$ x_t @f$, previous state @f$ h_t @f$ and learned weights as follows:
    @f{eqnarray*}{
    h_t &= tanh&(W_{hh} h_{t-1} + W_{xh} x_t + b_h),  \\
    o_t &= tanh&(W_{ho} h_t + b_o),
    @f}

    @param Wxh is @f$ W_{xh} @f$ matrix
    @param bh  is @f$ b_{h}  @f$ vector
    @param Whh is @f$ W_{hh} @f$ matrix
    @param Who is @f$ W_{ho} @f$ matrix
    @param bo  is @f$ b_{o}  @f$ vector
    */
    virtual void setWeights(const Mat &Wxh, const Mat &bh, const Mat &Whh, const Mat &Who, const Mat &bo) = 0;

    /** @brief If this flag is set to true then layer will produce @f$ h_t @f$ as second output.
     * @details Shape of the second output is the same as first output.
     */
    virtual void setProduceHiddenOutput(bool produce = false) = 0;

};

//! Common parameters shared by convolution-like layers.
class CV_EXPORTS BaseConvolutionLayer : public Layer
{
public:
    Size kernel, stride, pad, dilation, adjustPad;
    String padMode;
    int numOutput;
};

class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
    static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};

class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
{
public:
    static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};

//! Local Response Normalization layer.
class CV_EXPORTS LRNLayer : public Layer
{
public:
    int type;

    int size;
    float alpha, beta, bias;
    bool normBySize;

    static Ptr<LRNLayer> create(const LayerParams& params);
};

class CV_EXPORTS PoolingLayer : public Layer
{
public:
    int type;
    Size kernel, stride;
    int pad_l, pad_t, pad_r, pad_b;
    CV_DEPRECATED_EXTERNAL Size pad;
    bool globalPooling;
    bool computeMaxIdx;
    String padMode;
    bool ceilMode;
    // If true for average pooling with padding, divide every output region
    // by the whole kernel area. Otherwise exclude zero padded values and divide
    // by number of real values.
    bool avePoolPaddedArea;
    // ROIPooling parameters.
    Size pooledSize;
    float spatialScale;
    // PSROIPooling parameters.
    int psRoiOutChannels;

    static Ptr<PoolingLayer> create(const LayerParams& params);
};

class CV_EXPORTS SoftmaxLayer : public Layer
{
public:
    bool logSoftMax;

    static Ptr<SoftmaxLayer> create(const LayerParams& params);
};

class CV_EXPORTS InnerProductLayer : public Layer
{
public:
    int axis;
    static Ptr<InnerProductLayer> create(const LayerParams& params);
};

//! Mean-Variance Normalization layer.
class CV_EXPORTS MVNLayer : public Layer
{
public:
    float eps;
    bool normVariance, acrossChannels;

    static Ptr<MVNLayer> create(const LayerParams& params);
};

/* Reshaping */

class CV_EXPORTS ReshapeLayer : public Layer
{
public:
    MatShape newShapeDesc;
    Range newShapeRange;

    static Ptr<ReshapeLayer> create(const LayerParams& params);
};

class CV_EXPORTS FlattenLayer : public Layer
{
public:
    static Ptr<FlattenLayer> create(const LayerParams &params);
};

class CV_EXPORTS ConcatLayer : public Layer
{
public:
    int axis;
    /**
     * @brief Add zero padding in case of concatenation of blobs with different
     * spatial sizes.
     *
     * Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat
     */
    bool padding;

    static Ptr<ConcatLayer> create(const LayerParams &params);
};

class CV_EXPORTS SplitLayer : public Layer
{
public:
    int outputsCount;  //!< Number of copies that will be produced (is ignored when negative).

    static Ptr<SplitLayer> create(const LayerParams &params);
};

/**
 * Slice layer has several modes:
 * 1. Caffe mode
 * @param[in] axis Axis of split operation
 * @param[in] slice_point Array of split points
 *
 * Number of output blobs equals to number of split points plus one. The
 * first blob is a slice on input from 0 to @p slice_point[0] - 1 by @p axis,
 * the second output blob is a slice of input from @p slice_point[0] to
 * @p slice_point[1] - 1 by @p axis and the last output blob is a slice of
 * input from @p slice_point[-1] up to the end of @p axis size.
 *
 * 2. TensorFlow mode
 * @param begin Vector of start indices
 * @param size Vector of sizes
 *
 * More convenient numpy-like slice. One and only output blob
 * is a slice `input[begin[0]:begin[0]+size[0], begin[1]:begin[1]+size[1], ...]`
 *
 * 3. Torch mode
 * @param axis Axis of split operation
 *
 * Split input blob on the equal parts by @p axis.
 */
class CV_EXPORTS SliceLayer : public Layer
{
public:
    /**
     * @brief Vector of slice ranges.
     *
     * The first dimension equals number of output blobs.
     * Inner vector has slice ranges for the first number of input dimensions.
     */
    std::vector<std::vector<Range> > sliceRanges;
    int axis;

    static Ptr<SliceLayer> create(const LayerParams &params);
};

class CV_EXPORTS PermuteLayer : public Layer
{
public:
    static Ptr<PermuteLayer> create(const LayerParams& params);
};

/**
 * Permute channels of 4-dimensional input blob.
 * @param group Number of groups to split input channels and pick in turns
 *              into output blob.
 *
 * \f[ groupSize = \frac{number\ of\ channels}{group} \f]
 * \f[ output(n, c, h, w) = input(n, groupSize \times (c \% group) + \lfloor \frac{c}{group} \rfloor, h, w) \f]
 * Read more at https://arxiv.org/pdf/1707.01083.pdf
 */
class CV_EXPORTS ShuffleChannelLayer : public Layer
{
public:
    static Ptr<Layer> create(const LayerParams& params);

    int group;
};

/**
 * @brief Adds extra values for specific axes.
 * @param paddings Vector of paddings in format
 *                 @code
 *                 [ pad_before, pad_after,  // [0]th dimension
 *                   pad_before, pad_after,  // [1]st dimension
 *                   ...
 *                   pad_before, pad_after ] // [n]th dimension
 *                 @endcode
 *                 that represents number of padded values at every dimension
 *                 starting from the first one. The rest of dimensions won't
 *                 be padded.
 * @param value Value to be padded. Defaults to zero.
 * @param type Padding type: 'constant', 'reflect'
 * @param input_dims Torch's parameter. If @p input_dims is not equal to the
 *                   actual input dimensionality then the `[0]th` dimension
 *                   is considered as a batch dimension and @p paddings are shifted
 *                   to a one dimension. Defaults to `-1` that means padding
 *                   corresponding to @p paddings.
 */
class CV_EXPORTS PaddingLayer : public Layer
{
public:
    static Ptr<PaddingLayer> create(const LayerParams& params);
};

/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
    virtual void forwardSlice(const float* src, float* dst, int len,
                              size_t outPlaneSize, int cn0, int cn1) const = 0;
};

class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
    float negativeSlope;

    static Ptr<ReLULayer> create(const LayerParams &params);
};

class CV_EXPORTS ReLU6Layer : public ActivationLayer
{
public:
    float minValue, maxValue;

    static Ptr<ReLU6Layer> create(const LayerParams &params);
};

class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
    static Ptr<Layer> create(const LayerParams& params);
};

class CV_EXPORTS ELULayer : public ActivationLayer
{
public:
    static Ptr<ELULayer> create(const LayerParams &params);
};

class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
    static Ptr<TanHLayer> create(const LayerParams &params);
};

class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
    static Ptr<SigmoidLayer> create(const LayerParams &params);
};

class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
    static Ptr<BNLLLayer> create(const LayerParams &params);
};

class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
    static Ptr<AbsLayer> create(const LayerParams &params);
};

class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
    float power, scale, shift;

    static Ptr<PowerLayer> create(const LayerParams &params);
};

/* Layers used in semantic segmentation */

class CV_EXPORTS CropLayer : public Layer
{
public:
    int startAxis;
    std::vector<int> offset;

    static Ptr<CropLayer> create(const LayerParams &params);
};

class CV_EXPORTS EltwiseLayer : public Layer
{
public:
    static Ptr<EltwiseLayer> create(const LayerParams &params);
};

class CV_EXPORTS BatchNormLayer : public ActivationLayer
{
public:
    bool hasWeights, hasBias;
    float epsilon;

    static Ptr<BatchNormLayer> create(const LayerParams &params);
};

class CV_EXPORTS MaxUnpoolLayer : public Layer
{
public:
    Size poolKernel;
    Size poolPad;
    Size poolStride;

    static Ptr<MaxUnpoolLayer> create(const LayerParams &params);
};

class CV_EXPORTS ScaleLayer : public Layer
{
public:
    bool hasBias;
    int axis;

    static Ptr<ScaleLayer> create(const LayerParams& params);
};

class CV_EXPORTS ShiftLayer : public Layer
{
public:
    static Ptr<Layer> create(const LayerParams& params);
};

class CV_EXPORTS PriorBoxLayer : public Layer
{
public:
    static Ptr<PriorBoxLayer> create(const LayerParams& params);
};

class CV_EXPORTS ReorgLayer : public Layer
{
public:
    static Ptr<ReorgLayer> create(const LayerParams& params);
};

class CV_EXPORTS RegionLayer : public Layer
{
public:
    static Ptr<RegionLayer> create(const LayerParams& params);
};

class CV_EXPORTS DetectionOutputLayer : public Layer
{
public:
    static Ptr<DetectionOutputLayer> create(const LayerParams& params);
};

/**
 * @brief \f$ L_p \f$ - normalization layer.
 * @param p Normalization factor. The most common `p = 1` for \f$ L_1 \f$ -
 *          normalization or `p = 2` for \f$ L_2 \f$ - normalization or a custom one.
 * @param eps Parameter \f$ \epsilon \f$ to prevent a division by zero.
 * @param across_spatial If true, normalize an input across all non-batch dimensions.
 *                       Otherwise normalize every channel separately.
 *
 * Across spatial:
 * @f[
 * norm = \sqrt[p]{\epsilon + \sum_{x, y, c} |src(x, y, c)|^p } \\
 * dst(x, y, c) = \frac{ src(x, y, c) }{norm}
 * @f]
 *
 * Channel wise normalization:
 * @f[
 * norm(c) = \sqrt[p]{\epsilon + \sum_{x, y} |src(x, y, c)|^p } \\
 * dst(x, y, c) = \frac{ src(x, y, c) }{norm(c)}
 * @f]
 *
 * Where `x, y` - spatial coordinates, `c` - channel.
 *
 * Every sample in the batch is normalized separately. Optionally,
 * output is scaled by the trained parameters.
 */
class CV_EXPORTS NormalizeBBoxLayer : public Layer
{
public:
    float pnorm, epsilon;
    CV_DEPRECATED_EXTERNAL bool acrossSpatial;

    static Ptr<NormalizeBBoxLayer> create(const LayerParams& params);
};

/**
 * @brief Resize input 4-dimensional blob by nearest neighbor or bilinear strategy.
 *
 * Layer is used to support TensorFlow's resize_nearest_neighbor and resize_bilinear ops.
 */
class CV_EXPORTS ResizeLayer : public Layer
{
public:
    static Ptr<ResizeLayer> create(const LayerParams& params);
};

/**
 * @brief Bilinear resize layer from https://github.com/cdmh/deeplab-public
 *
 * It differs from @ref ResizeLayer in output shape and resize scales computations.
 */
class CV_EXPORTS InterpLayer : public Layer
{
public:
    static Ptr<Layer> create(const LayerParams& params);
};

class CV_EXPORTS ProposalLayer : public Layer
{
public:
    static Ptr<ProposalLayer> create(const LayerParams& params);
};

class CV_EXPORTS CropAndResizeLayer : public Layer
{
public:
    static Ptr<Layer> create(const LayerParams& params);
};

//! @}
//! @}
CV__DNN_EXPERIMENTAL_NS_END
}
}
#endif