lib/main/STM32F1/Drivers/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        19. March 2015
   5 * $Revision:    V.1.4.5
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_q7.c
   9 *
  10 * Description:  Partial convolution of Q7 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q7 sequences.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
  61  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  62  *
  63  * \par
  64  * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function.
  65  *
  66  */
  67
  68 arm_status arm_conv_partial_q7(
  69   q7_t * pSrcA,
  70   uint32_t srcALen,
  71   q7_t * pSrcB,
  72   uint32_t srcBLen,
  73   q7_t * pDst,
  74   uint32_t firstIndex,
  75   uint32_t numPoints)
  76 {
  77
  78
  79 #ifndef ARM_MATH_CM0_FAMILY
  80
  81   /* Run the below code for Cortex-M4 and Cortex-M3 */
  82
  83   q7_t *pIn1;                                    /* inputA pointer */
  84   q7_t *pIn2;                                    /* inputB pointer */
  85   q7_t *pOut = pDst;                             /* output pointer */
  86   q7_t *px;                                      /* Intermediate inputA pointer */
  87   q7_t *py;                                      /* Intermediate inputB pointer */
  88   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
  89   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  90   q31_t input1, input2;
  91   q15_t in1, in2;
  92   q7_t x0, x1, x2, x3, c0, c1;
  93   uint32_t j, k, count, check, blkCnt;
  94   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter */
  95   arm_status status;
  96
  97
  98   /* Check for range of output samples to be calculated */
  99   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 100   {
 101     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 102     status = ARM_MATH_ARGUMENT_ERROR;
 103   }
 104   else
 105   {
 106
 107     /* The algorithm implementation is based on the lengths of the inputs. */
 108     /* srcB is always made to slide across srcA. */
 109     /* So srcBLen is always considered as shorter or equal to srcALen */
 110     if(srcALen >= srcBLen)
 111     {
 112       /* Initialization of inputA pointer */
 113       pIn1 = pSrcA;
 114
 115       /* Initialization of inputB pointer */
 116       pIn2 = pSrcB;
 117     }
 118     else
 119     {
 120       /* Initialization of inputA pointer */
 121       pIn1 = pSrcB;
 122
 123       /* Initialization of inputB pointer */
 124       pIn2 = pSrcA;
 125
 126       /* srcBLen is always considered as shorter or equal to srcALen */
 127       j = srcBLen;
 128       srcBLen = srcALen;
 129       srcALen = j;
 130     }
 131
 132     /* Conditions to check which loopCounter holds
 133      * the first and last indices of the output samples to be calculated. */
 134     check = firstIndex + numPoints;
 135     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
 136     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
 137     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 138     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 139                                      (int32_t) numPoints) : 0;
 140     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 141                                     (int32_t) firstIndex);
 142     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 143
 144     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 145     /* The function is internally
 146      * divided into three stages according to the number of multiplications that has to be
 147      * taken place between inputA samples and inputB samples. In the first stage of the
 148      * algorithm, the multiplications increase by one for every iteration.
 149      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 150      * In the third stage of the algorithm, the multiplications decrease by one
 151      * for every iteration. */
 152
 153     /* Set the output pointer to point to the firstIndex
 154      * of the output sample to be calculated. */
 155     pOut = pDst + firstIndex;
 156
 157     /* --------------------------
 158      * Initializations of stage1
 159      * -------------------------*/
 160
 161     /* sum = x[0] * y[0]
 162      * sum = x[0] * y[1] + x[1] * y[0]
 163      * ....
 164      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 165      */
 166
 167     /* In this stage the MAC operations are increased by 1 for every iteration.
 168        The count variable holds the number of MAC operations performed.
 169        Since the partial convolution starts from from firstIndex
 170        Number of Macs to be performed is firstIndex + 1 */
 171     count = 1u + firstIndex;
 172
 173     /* Working pointer of inputA */
 174     px = pIn1;
 175
 176     /* Working pointer of inputB */
 177     pSrc2 = pIn2 + firstIndex;
 178     py = pSrc2;
 179
 180     /* ------------------------
 181      * Stage1 process
 182      * ----------------------*/
 183
 184     /* The first stage starts here */
 185     while(blockSize1 > 0)
 186     {
 187       /* Accumulator is made zero for every iteration */
 188       sum = 0;
 189
 190       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 191       k = count >> 2u;
 192
 193       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 194        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 195       while(k > 0u)
 196       {
 197         /* x[0] , x[1] */
 198         in1 = (q15_t) * px++;
 199         in2 = (q15_t) * px++;
 200         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 201
 202         /* y[srcBLen - 1] , y[srcBLen - 2] */
 203         in1 = (q15_t) * py--;
 204         in2 = (q15_t) * py--;
 205         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 206
 207         /* x[0] * y[srcBLen - 1] */
 208         /* x[1] * y[srcBLen - 2] */
 209         sum = __SMLAD(input1, input2, sum);
 210
 211         /* x[2] , x[3] */
 212         in1 = (q15_t) * px++;
 213         in2 = (q15_t) * px++;
 214         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 215
 216         /* y[srcBLen - 3] , y[srcBLen - 4] */
 217         in1 = (q15_t) * py--;
 218         in2 = (q15_t) * py--;
 219         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 220
 221         /* x[2] * y[srcBLen - 3] */
 222         /* x[3] * y[srcBLen - 4] */
 223         sum = __SMLAD(input1, input2, sum);
 224
 225         /* Decrement the loop counter */
 226         k--;
 227       }
 228
 229       /* If the count is not a multiple of 4, compute any remaining MACs here.
 230        ** No loop unrolling is used. */
 231       k = count % 0x4u;
 232
 233       while(k > 0u)
 234       {
 235         /* Perform the multiply-accumulates */
 236         sum += ((q31_t) * px++ * *py--);
 237
 238         /* Decrement the loop counter */
 239         k--;
 240       }
 241
 242       /* Store the result in the accumulator in the destination buffer. */
 243       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
 244
 245       /* Update the inputA and inputB pointers for next MAC calculation */
 246       py = ++pSrc2;
 247       px = pIn1;
 248
 249       /* Increment the MAC count */
 250       count++;
 251
 252       /* Decrement the loop counter */
 253       blockSize1--;
 254     }
 255
 256     /* --------------------------
 257      * Initializations of stage2
 258      * ------------------------*/
 259
 260     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 261      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 262      * ....
 263      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 264      */
 265
 266     /* Working pointer of inputA */
 267     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
 268     {
 269       px = pIn1 + firstIndex - srcBLen + 1;
 270     }
 271     else
 272     {
 273       px = pIn1;
 274     }
 275
 276     /* Working pointer of inputB */
 277     pSrc2 = pIn2 + (srcBLen - 1u);
 278     py = pSrc2;
 279
 280     /* count is index by which the pointer pIn1 to be incremented */
 281     count = 0u;
 282
 283     /* -------------------
 284      * Stage2 process
 285      * ------------------*/
 286
 287     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 288      * So, to loop unroll over blockSize2,
 289      * srcBLen should be greater than or equal to 4 */
 290     if(srcBLen >= 4u)
 291     {
 292       /* Loop unroll over blockSize2, by 4 */
 293       blkCnt = ((uint32_t) blockSize2 >> 2u);
 294
 295       while(blkCnt > 0u)
 296       {
 297         /* Set all accumulators to zero */
 298         acc0 = 0;
 299         acc1 = 0;
 300         acc2 = 0;
 301         acc3 = 0;
 302
 303         /* read x[0], x[1], x[2] samples */
 304         x0 = *(px++);
 305         x1 = *(px++);
 306         x2 = *(px++);
 307
 308         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 309         k = srcBLen >> 2u;
 310
 311         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 312          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 313         do
 314         {
 315           /* Read y[srcBLen - 1] sample */
 316           c0 = *(py--);
 317           /* Read y[srcBLen - 2] sample */
 318           c1 = *(py--);
 319
 320           /* Read x[3] sample */
 321           x3 = *(px++);
 322
 323           /* x[0] and x[1] are packed */
 324           in1 = (q15_t) x0;
 325           in2 = (q15_t) x1;
 326
 327           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 328
 329           /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
 330           in1 = (q15_t) c0;
 331           in2 = (q15_t) c1;
 332
 333           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 334
 335           /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
 336           acc0 = __SMLAD(input1, input2, acc0);
 337
 338           /* x[1] and x[2] are packed */
 339           in1 = (q15_t) x1;
 340           in2 = (q15_t) x2;
 341
 342           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 343
 344           /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
 345           acc1 = __SMLAD(input1, input2, acc1);
 346
 347           /* x[2] and x[3] are packed */
 348           in1 = (q15_t) x2;
 349           in2 = (q15_t) x3;
 350
 351           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 352
 353           /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
 354           acc2 = __SMLAD(input1, input2, acc2);
 355
 356           /* Read x[4] sample */
 357           x0 = *(px++);
 358
 359           /* x[3] and x[4] are packed */
 360           in1 = (q15_t) x3;
 361           in2 = (q15_t) x0;
 362
 363           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 364
 365           /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
 366           acc3 = __SMLAD(input1, input2, acc3);
 367
 368           /* Read y[srcBLen - 3] sample */
 369           c0 = *(py--);
 370           /* Read y[srcBLen - 4] sample */
 371           c1 = *(py--);
 372
 373           /* Read x[5] sample */
 374           x1 = *(px++);
 375
 376           /* x[2] and x[3] are packed */
 377           in1 = (q15_t) x2;
 378           in2 = (q15_t) x3;
 379
 380           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 381
 382           /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
 383           in1 = (q15_t) c0;
 384           in2 = (q15_t) c1;
 385
 386           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 387
 388           /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
 389           acc0 = __SMLAD(input1, input2, acc0);
 390
 391           /* x[3] and x[4] are packed */
 392           in1 = (q15_t) x3;
 393           in2 = (q15_t) x0;
 394
 395           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 396
 397           /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
 398           acc1 = __SMLAD(input1, input2, acc1);
 399
 400           /* x[4] and x[5] are packed */
 401           in1 = (q15_t) x0;
 402           in2 = (q15_t) x1;
 403
 404           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 405
 406           /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
 407           acc2 = __SMLAD(input1, input2, acc2);
 408
 409           /* Read x[6] sample */
 410           x2 = *(px++);
 411
 412           /* x[5] and x[6] are packed */
 413           in1 = (q15_t) x1;
 414           in2 = (q15_t) x2;
 415
 416           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 417
 418           /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
 419           acc3 = __SMLAD(input1, input2, acc3);
 420
 421         } while(--k);
 422
 423         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 424          ** No loop unrolling is used. */
 425         k = srcBLen % 0x4u;
 426
 427         while(k > 0u)
 428         {
 429           /* Read y[srcBLen - 5] sample */
 430           c0 = *(py--);
 431
 432           /* Read x[7] sample */
 433           x3 = *(px++);
 434
 435           /* Perform the multiply-accumulates */
 436           /* acc0 +=  x[4] * y[srcBLen - 5] */
 437           acc0 += ((q31_t) x0 * c0);
 438           /* acc1 +=  x[5] * y[srcBLen - 5] */
 439           acc1 += ((q31_t) x1 * c0);
 440           /* acc2 +=  x[6] * y[srcBLen - 5] */
 441           acc2 += ((q31_t) x2 * c0);
 442           /* acc3 +=  x[7] * y[srcBLen - 5] */
 443           acc3 += ((q31_t) x3 * c0);
 444
 445           /* Reuse the present samples for the next MAC */
 446           x0 = x1;
 447           x1 = x2;
 448           x2 = x3;
 449
 450           /* Decrement the loop counter */
 451           k--;
 452         }
 453
 454         /* Store the result in the accumulator in the destination buffer. */
 455         *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
 456         *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
 457         *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
 458         *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
 459
 460         /* Increment the pointer pIn1 index, count by 4 */
 461         count += 4u;
 462
 463         /* Update the inputA and inputB pointers for next MAC calculation */
 464         px = pIn1 + count;
 465         py = pSrc2;
 466
 467
 468         /* Decrement the loop counter */
 469         blkCnt--;
 470       }
 471
 472       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 473        ** No loop unrolling is used. */
 474       blkCnt = (uint32_t) blockSize2 % 0x4u;
 475
 476       while(blkCnt > 0u)
 477       {
 478         /* Accumulator is made zero for every iteration */
 479         sum = 0;
 480
 481         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 482         k = srcBLen >> 2u;
 483
 484         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 485          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 486         while(k > 0u)
 487         {
 488
 489           /* Reading two inputs of SrcA buffer and packing */
 490           in1 = (q15_t) * px++;
 491           in2 = (q15_t) * px++;
 492           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 493
 494           /* Reading two inputs of SrcB buffer and packing */
 495           in1 = (q15_t) * py--;
 496           in2 = (q15_t) * py--;
 497           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 498
 499           /* Perform the multiply-accumulates */
 500           sum = __SMLAD(input1, input2, sum);
 501
 502           /* Reading two inputs of SrcA buffer and packing */
 503           in1 = (q15_t) * px++;
 504           in2 = (q15_t) * px++;
 505           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 506
 507           /* Reading two inputs of SrcB buffer and packing */
 508           in1 = (q15_t) * py--;
 509           in2 = (q15_t) * py--;
 510           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 511
 512           /* Perform the multiply-accumulates */
 513           sum = __SMLAD(input1, input2, sum);
 514
 515           /* Decrement the loop counter */
 516           k--;
 517         }
 518
 519         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 520          ** No loop unrolling is used. */
 521         k = srcBLen % 0x4u;
 522
 523         while(k > 0u)
 524         {
 525           /* Perform the multiply-accumulates */
 526           sum += ((q31_t) * px++ * *py--);
 527
 528           /* Decrement the loop counter */
 529           k--;
 530         }
 531
 532         /* Store the result in the accumulator in the destination buffer. */
 533         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
 534
 535         /* Increment the pointer pIn1 index, count by 1 */
 536             count++;
 537
 538         /* Update the inputA and inputB pointers for next MAC calculation */
 539         px = pIn1 + count;
 540         py = pSrc2;
 541
 542         /* Decrement the loop counter */
 543         blkCnt--;
 544       }
 545     }
 546     else
 547     {
 548       /* If the srcBLen is not a multiple of 4,
 549        * the blockSize2 loop cannot be unrolled by 4 */
 550       blkCnt = (uint32_t) blockSize2;
 551
 552       while(blkCnt > 0u)
 553       {
 554         /* Accumulator is made zero for every iteration */
 555         sum = 0;
 556
 557         /* srcBLen number of MACS should be performed */
 558         k = srcBLen;
 559
 560         while(k > 0u)
 561         {
 562           /* Perform the multiply-accumulate */
 563           sum += ((q31_t) * px++ * *py--);
 564
 565           /* Decrement the loop counter */
 566           k--;
 567         }
 568
 569         /* Store the result in the accumulator in the destination buffer. */
 570         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
 571
 572         /* Increment the MAC count */
 573         count++;
 574
 575         /* Update the inputA and inputB pointers for next MAC calculation */
 576         px = pIn1 + count;
 577         py = pSrc2;
 578
 579         /* Decrement the loop counter */
 580         blkCnt--;
 581       }
 582     }
 583
 584
 585     /* --------------------------
 586      * Initializations of stage3
 587      * -------------------------*/
 588
 589     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 590      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 591      * ....
 592      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 593      * sum +=  x[srcALen-1] * y[srcBLen-1]
 594      */
 595
 596     /* In this stage the MAC operations are decreased by 1 for every iteration.
 597        The count variable holds the number of MAC operations performed */
 598     count = srcBLen - 1u;
 599
 600     /* Working pointer of inputA */
 601     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 602     px = pSrc1;
 603
 604     /* Working pointer of inputB */
 605     pSrc2 = pIn2 + (srcBLen - 1u);
 606     py = pSrc2;
 607
 608     /* -------------------
 609      * Stage3 process
 610      * ------------------*/
 611
 612     while(blockSize3 > 0)
 613     {
 614       /* Accumulator is made zero for every iteration */
 615       sum = 0;
 616
 617       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 618       k = count >> 2u;
 619
 620       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 621        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 622       while(k > 0u)
 623       {
 624         /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
 625         in1 = (q15_t) * px++;
 626         in2 = (q15_t) * px++;
 627         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 628
 629         /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
 630         in1 = (q15_t) * py--;
 631         in2 = (q15_t) * py--;
 632         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 633
 634         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 635         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 636         sum = __SMLAD(input1, input2, sum);
 637
 638         /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
 639         in1 = (q15_t) * px++;
 640         in2 = (q15_t) * px++;
 641         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 642
 643         /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
 644         in1 = (q15_t) * py--;
 645         in2 = (q15_t) * py--;
 646         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 647
 648         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 649         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 650         sum = __SMLAD(input1, input2, sum);
 651
 652         /* Decrement the loop counter */
 653         k--;
 654       }
 655
 656       /* If the count is not a multiple of 4, compute any remaining MACs here.
 657        ** No loop unrolling is used. */
 658       k = count % 0x4u;
 659
 660       while(k > 0u)
 661       {
 662         /* Perform the multiply-accumulates */
 663         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 664         sum += ((q31_t) * px++ * *py--);
 665
 666         /* Decrement the loop counter */
 667         k--;
 668       }
 669
 670       /* Store the result in the accumulator in the destination buffer. */
 671       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
 672
 673       /* Update the inputA and inputB pointers for next MAC calculation */
 674       px = ++pSrc1;
 675       py = pSrc2;
 676
 677       /* Decrement the MAC count */
 678       count--;
 679
 680       /* Decrement the loop counter */
 681       blockSize3--;
 682
 683     }
 684
 685     /* set status as ARM_MATH_SUCCESS */
 686     status = ARM_MATH_SUCCESS;
 687   }
 688
 689   /* Return to application */
 690   return (status);
 691
 692 #else
 693
 694   /* Run the below code for Cortex-M0 */
 695
 696   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
 697   q7_t *pIn2 = pSrcB;                            /* inputB pointer */
 698   q31_t sum;                                     /* Accumulator */
 699   uint32_t i, j;                                 /* loop counters */
 700   arm_status status;                             /* status of Partial convolution */
 701
 702   /* Check for range of output samples to be calculated */
 703   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 704   {
 705     /* Set status as ARM_ARGUMENT_ERROR */
 706     status = ARM_MATH_ARGUMENT_ERROR;
 707   }
 708   else
 709   {
 710     /* Loop to calculate convolution for output length number of values */
 711     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
 712     {
 713       /* Initialize sum with zero to carry on MAC operations */
 714       sum = 0;
 715
 716       /* Loop to perform MAC operations according to convolution equation */
 717       for (j = 0; j <= i; j++)
 718       {
 719         /* Check the array limitations */
 720         if(((i - j) < srcBLen) && (j < srcALen))
 721         {
 722           /* z[i] += x[i-j] * y[j] */
 723           sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
 724         }
 725       }
 726
 727       /* Store the output in the destination buffer */
 728       pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
 729     }
 730     /* set status as ARM_SUCCESS as there are no argument errors */
 731     status = ARM_MATH_SUCCESS;
 732   }
 733   return (status);
 734
 735 #endif /*  #ifndef ARM_MATH_CM0_FAMILY */
 736
 737 }
 738
 739 /**
 740  * @} end of PartialConv group
 741  */