lib/main/STM32F3/Drivers/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        19. March 2015
   5 * $Revision:    V.1.4.5
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_opt_q7.c
   9 *
  10 * Description:  Partial convolution of Q7 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q7 sequences.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
 * @param[in]      *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
 * @param[in]      *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
 * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
  64  *
 * \par Restrictions
 *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
 *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
  68  *
  69  *
  70  *
  71  */
  72
  73
  74 #ifndef UNALIGNED_SUPPORT_DISABLE
  75
  76 arm_status arm_conv_partial_opt_q7(
  77   q7_t * pSrcA,
  78   uint32_t srcALen,
  79   q7_t * pSrcB,
  80   uint32_t srcBLen,
  81   q7_t * pDst,
  82   uint32_t firstIndex,
  83   uint32_t numPoints,
  84   q15_t * pScratch1,
  85   q15_t * pScratch2)
  86 {
  87
  88   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
  89   q15_t x4;                                      /* Temporary input variable */
  90   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
  91   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
  92   q7_t *px;                                      /* Temporary input1 pointer */
  93   q15_t *py;                                     /* Temporary input2 pointer */
  94   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
  95   q31_t x1, x2, x3, y1;                          /* Temporary input variables */
  96   arm_status status;
  97   q7_t *pOut = pDst;                             /* output pointer */
  98   q7_t out0, out1, out2, out3;                   /* temporary variables */
  99
 100   /* Check for range of output samples to be calculated */
 101   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 102   {
 103     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 104     status = ARM_MATH_ARGUMENT_ERROR;
 105   }
 106   else
 107   {
 108
 109     /* The algorithm implementation is based on the lengths of the inputs. */
 110     /* srcB is always made to slide across srcA. */
 111     /* So srcBLen is always considered as shorter or equal to srcALen */
 112     if(srcALen >= srcBLen)
 113     {
 114       /* Initialization of inputA pointer */
 115       pIn1 = pSrcA;
 116
 117       /* Initialization of inputB pointer */
 118       pIn2 = pSrcB;
 119     }
 120     else
 121     {
 122       /* Initialization of inputA pointer */
 123       pIn1 = pSrcB;
 124
 125       /* Initialization of inputB pointer */
 126       pIn2 = pSrcA;
 127
 128       /* srcBLen is always considered as shorter or equal to srcALen */
 129       j = srcBLen;
 130       srcBLen = srcALen;
 131       srcALen = j;
 132     }
 133
 134     /* pointer to take end of scratch2 buffer */
 135     pScr2 = pScratch2;
 136
 137     /* points to smaller length sequence */
 138     px = pIn2 + srcBLen - 1;
 139
 140     /* Apply loop unrolling and do 4 Copies simultaneously. */
 141     k = srcBLen >> 2u;
 142
 143     /* First part of the processing with loop unrolling copies 4 data points at a time.
 144      ** a second loop below copies for the remaining 1 to 3 samples. */
 145     while(k > 0u)
 146     {
 147       /* copy second buffer in reversal manner */
 148       x4 = (q15_t) * px--;
 149       *pScr2++ = x4;
 150       x4 = (q15_t) * px--;
 151       *pScr2++ = x4;
 152       x4 = (q15_t) * px--;
 153       *pScr2++ = x4;
 154       x4 = (q15_t) * px--;
 155       *pScr2++ = x4;
 156
 157       /* Decrement the loop counter */
 158       k--;
 159     }
 160
 161     /* If the count is not a multiple of 4, copy remaining samples here.
 162      ** No loop unrolling is used. */
 163     k = srcBLen % 0x4u;
 164
 165     while(k > 0u)
 166     {
 167       /* copy second buffer in reversal manner for remaining samples */
 168       x4 = (q15_t) * px--;
 169       *pScr2++ = x4;
 170
 171       /* Decrement the loop counter */
 172       k--;
 173     }
 174
 175     /* Initialze temporary scratch pointer */
 176     pScr1 = pScratch1;
 177
 178     /* Fill (srcBLen - 1u) zeros in scratch buffer */
 179     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 180
 181     /* Update temporary scratch pointer */
 182     pScr1 += (srcBLen - 1u);
 183
 184     /* Copy (srcALen) samples in scratch buffer */
 185     /* Apply loop unrolling and do 4 Copies simultaneously. */
 186     k = srcALen >> 2u;
 187
 188     /* First part of the processing with loop unrolling copies 4 data points at a time.
 189      ** a second loop below copies for the remaining 1 to 3 samples. */
 190     while(k > 0u)
 191     {
 192       /* copy second buffer in reversal manner */
 193       x4 = (q15_t) * pIn1++;
 194       *pScr1++ = x4;
 195       x4 = (q15_t) * pIn1++;
 196       *pScr1++ = x4;
 197       x4 = (q15_t) * pIn1++;
 198       *pScr1++ = x4;
 199       x4 = (q15_t) * pIn1++;
 200       *pScr1++ = x4;
 201
 202       /* Decrement the loop counter */
 203       k--;
 204     }
 205
 206     /* If the count is not a multiple of 4, copy remaining samples here.
 207      ** No loop unrolling is used. */
 208     k = srcALen % 0x4u;
 209
 210     while(k > 0u)
 211     {
 212       /* copy second buffer in reversal manner for remaining samples */
 213       x4 = (q15_t) * pIn1++;
 214       *pScr1++ = x4;
 215
 216       /* Decrement the loop counter */
 217       k--;
 218     }
 219
 220     /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
 221     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 222
 223     /* Update pointer */
 224     pScr1 += (srcBLen - 1u);
 225
 226
 227     /* Temporary pointer for scratch2 */
 228     py = pScratch2;
 229
 230     /* Initialization of pIn2 pointer */
 231     pIn2 = (q7_t *) py;
 232
 233     pScr2 = py;
 234
 235     pOut = pDst + firstIndex;
 236
 237     pScratch1 += firstIndex;
 238
 239     /* Actual convolution process starts here */
 240     blkCnt = (numPoints) >> 2;
 241
 242
 243     while(blkCnt > 0)
 244     {
 245       /* Initialze temporary scratch pointer as scratch1 */
 246       pScr1 = pScratch1;
 247
 248       /* Clear Accumlators */
 249       acc0 = 0;
 250       acc1 = 0;
 251       acc2 = 0;
 252       acc3 = 0;
 253
 254       /* Read two samples from scratch1 buffer */
 255       x1 = *__SIMD32(pScr1)++;
 256
 257       /* Read next two samples from scratch1 buffer */
 258       x2 = *__SIMD32(pScr1)++;
 259
 260       tapCnt = (srcBLen) >> 2u;
 261
 262       while(tapCnt > 0u)
 263       {
 264
 265         /* Read four samples from smaller buffer */
 266         y1 = _SIMD32_OFFSET(pScr2);
 267
 268         /* multiply and accumlate */
 269         acc0 = __SMLAD(x1, y1, acc0);
 270         acc2 = __SMLAD(x2, y1, acc2);
 271
 272         /* pack input data */
 273 #ifndef ARM_MATH_BIG_ENDIAN
 274         x3 = __PKHBT(x2, x1, 0);
 275 #else
 276         x3 = __PKHBT(x1, x2, 0);
 277 #endif
 278
 279         /* multiply and accumlate */
 280         acc1 = __SMLADX(x3, y1, acc1);
 281
 282         /* Read next two samples from scratch1 buffer */
 283         x1 = *__SIMD32(pScr1)++;
 284
 285         /* pack input data */
 286 #ifndef ARM_MATH_BIG_ENDIAN
 287         x3 = __PKHBT(x1, x2, 0);
 288 #else
 289         x3 = __PKHBT(x2, x1, 0);
 290 #endif
 291
 292         acc3 = __SMLADX(x3, y1, acc3);
 293
 294         /* Read four samples from smaller buffer */
 295         y1 = _SIMD32_OFFSET(pScr2 + 2u);
 296
 297         acc0 = __SMLAD(x2, y1, acc0);
 298
 299         acc2 = __SMLAD(x1, y1, acc2);
 300
 301         acc1 = __SMLADX(x3, y1, acc1);
 302
 303         x2 = *__SIMD32(pScr1)++;
 304
 305 #ifndef ARM_MATH_BIG_ENDIAN
 306         x3 = __PKHBT(x2, x1, 0);
 307 #else
 308         x3 = __PKHBT(x1, x2, 0);
 309 #endif
 310
 311         acc3 = __SMLADX(x3, y1, acc3);
 312
 313         pScr2 += 4u;
 314
 315
 316         /* Decrement the loop counter */
 317         tapCnt--;
 318       }
 319
 320
 321
 322       /* Update scratch pointer for remaining samples of smaller length sequence */
 323       pScr1 -= 4u;
 324
 325
 326       /* apply same above for remaining samples of smaller length sequence */
 327       tapCnt = (srcBLen) & 3u;
 328
 329       while(tapCnt > 0u)
 330       {
 331
 332         /* accumlate the results */
 333         acc0 += (*pScr1++ * *pScr2);
 334         acc1 += (*pScr1++ * *pScr2);
 335         acc2 += (*pScr1++ * *pScr2);
 336         acc3 += (*pScr1++ * *pScr2++);
 337
 338         pScr1 -= 3u;
 339
 340         /* Decrement the loop counter */
 341         tapCnt--;
 342       }
 343
 344       blkCnt--;
 345
 346       /* Store the result in the accumulator in the destination buffer. */
 347       out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
 348       out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
 349       out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
 350       out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
 351
 352       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
 353
 354       /* Initialization of inputB pointer */
 355       pScr2 = py;
 356
 357       pScratch1 += 4u;
 358
 359     }
 360
 361     blkCnt = (numPoints) & 0x3;
 362
 363     /* Calculate convolution for remaining samples of Bigger length sequence */
 364     while(blkCnt > 0)
 365     {
 366       /* Initialze temporary scratch pointer as scratch1 */
 367       pScr1 = pScratch1;
 368
 369       /* Clear Accumlators */
 370       acc0 = 0;
 371
 372       tapCnt = (srcBLen) >> 1u;
 373
 374       while(tapCnt > 0u)
 375       {
 376
 377         /* Read next two samples from scratch1 buffer */
 378         x1 = *__SIMD32(pScr1)++;
 379
 380         /* Read two samples from smaller buffer */
 381         y1 = *__SIMD32(pScr2)++;
 382
 383         acc0 = __SMLAD(x1, y1, acc0);
 384
 385         /* Decrement the loop counter */
 386         tapCnt--;
 387       }
 388
 389       tapCnt = (srcBLen) & 1u;
 390
 391       /* apply same above for remaining samples of smaller length sequence */
 392       while(tapCnt > 0u)
 393       {
 394
 395         /* accumlate the results */
 396         acc0 += (*pScr1++ * *pScr2++);
 397
 398         /* Decrement the loop counter */
 399         tapCnt--;
 400       }
 401
 402       blkCnt--;
 403
 404       /* Store the result in the accumulator in the destination buffer. */
 405       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
 406
 407       /* Initialization of inputB pointer */
 408       pScr2 = py;
 409
 410       pScratch1 += 1u;
 411
 412     }
 413
 414     /* set status as ARM_MATH_SUCCESS */
 415     status = ARM_MATH_SUCCESS;
 416
 417
 418   }
 419
 420   return (status);
 421
 422 }
 423
 424 #else
 425
 426 arm_status arm_conv_partial_opt_q7(
 427   q7_t * pSrcA,
 428   uint32_t srcALen,
 429   q7_t * pSrcB,
 430   uint32_t srcBLen,
 431   q7_t * pDst,
 432   uint32_t firstIndex,
 433   uint32_t numPoints,
 434   q15_t * pScratch1,
 435   q15_t * pScratch2)
 436 {
 437
 438   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
 439   q15_t x4;                                      /* Temporary input variable */
 440   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
 441   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
 442   q7_t *px;                                      /* Temporary input1 pointer */
 443   q15_t *py;                                     /* Temporary input2 pointer */
 444   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
 445   arm_status status;
 446   q7_t *pOut = pDst;                             /* output pointer */
 447   q15_t x10, x11, x20, x21;                      /* Temporary input variables */
 448   q15_t y10, y11;                                /* Temporary input variables */
 449
 450   /* Check for range of output samples to be calculated */
 451   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 452   {
 453     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 454     status = ARM_MATH_ARGUMENT_ERROR;
 455   }
 456   else
 457   {
 458
 459     /* The algorithm implementation is based on the lengths of the inputs. */
 460     /* srcB is always made to slide across srcA. */
 461     /* So srcBLen is always considered as shorter or equal to srcALen */
 462     if(srcALen >= srcBLen)
 463     {
 464       /* Initialization of inputA pointer */
 465       pIn1 = pSrcA;
 466
 467       /* Initialization of inputB pointer */
 468       pIn2 = pSrcB;
 469     }
 470     else
 471     {
 472       /* Initialization of inputA pointer */
 473       pIn1 = pSrcB;
 474
 475       /* Initialization of inputB pointer */
 476       pIn2 = pSrcA;
 477
 478       /* srcBLen is always considered as shorter or equal to srcALen */
 479       j = srcBLen;
 480       srcBLen = srcALen;
 481       srcALen = j;
 482     }
 483
 484     /* pointer to take end of scratch2 buffer */
 485     pScr2 = pScratch2;
 486
 487     /* points to smaller length sequence */
 488     px = pIn2 + srcBLen - 1;
 489
 490     /* Apply loop unrolling and do 4 Copies simultaneously. */
 491     k = srcBLen >> 2u;
 492
 493     /* First part of the processing with loop unrolling copies 4 data points at a time.
 494      ** a second loop below copies for the remaining 1 to 3 samples. */
 495     while(k > 0u)
 496     {
 497       /* copy second buffer in reversal manner */
 498       x4 = (q15_t) * px--;
 499       *pScr2++ = x4;
 500       x4 = (q15_t) * px--;
 501       *pScr2++ = x4;
 502       x4 = (q15_t) * px--;
 503       *pScr2++ = x4;
 504       x4 = (q15_t) * px--;
 505       *pScr2++ = x4;
 506
 507       /* Decrement the loop counter */
 508       k--;
 509     }
 510
 511     /* If the count is not a multiple of 4, copy remaining samples here.
 512      ** No loop unrolling is used. */
 513     k = srcBLen % 0x4u;
 514
 515     while(k > 0u)
 516     {
 517       /* copy second buffer in reversal manner for remaining samples */
 518       x4 = (q15_t) * px--;
 519       *pScr2++ = x4;
 520
 521       /* Decrement the loop counter */
 522       k--;
 523     }
 524
 525     /* Initialze temporary scratch pointer */
 526     pScr1 = pScratch1;
 527
 528     /* Fill (srcBLen - 1u) zeros in scratch buffer */
 529     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 530
 531     /* Update temporary scratch pointer */
 532     pScr1 += (srcBLen - 1u);
 533
 534     /* Copy (srcALen) samples in scratch buffer */
 535     /* Apply loop unrolling and do 4 Copies simultaneously. */
 536     k = srcALen >> 2u;
 537
 538     /* First part of the processing with loop unrolling copies 4 data points at a time.
 539      ** a second loop below copies for the remaining 1 to 3 samples. */
 540     while(k > 0u)
 541     {
 542       /* copy second buffer in reversal manner */
 543       x4 = (q15_t) * pIn1++;
 544       *pScr1++ = x4;
 545       x4 = (q15_t) * pIn1++;
 546       *pScr1++ = x4;
 547       x4 = (q15_t) * pIn1++;
 548       *pScr1++ = x4;
 549       x4 = (q15_t) * pIn1++;
 550       *pScr1++ = x4;
 551
 552       /* Decrement the loop counter */
 553       k--;
 554     }
 555
 556     /* If the count is not a multiple of 4, copy remaining samples here.
 557      ** No loop unrolling is used. */
 558     k = srcALen % 0x4u;
 559
 560     while(k > 0u)
 561     {
 562       /* copy second buffer in reversal manner for remaining samples */
 563       x4 = (q15_t) * pIn1++;
 564       *pScr1++ = x4;
 565
 566       /* Decrement the loop counter */
 567       k--;
 568     }
 569
 570     /* Apply loop unrolling and do 4 Copies simultaneously. */
 571     k = (srcBLen - 1u) >> 2u;
 572
 573     /* First part of the processing with loop unrolling copies 4 data points at a time.
 574      ** a second loop below copies for the remaining 1 to 3 samples. */
 575     while(k > 0u)
 576     {
 577       /* copy second buffer in reversal manner */
 578       *pScr1++ = 0;
 579       *pScr1++ = 0;
 580       *pScr1++ = 0;
 581       *pScr1++ = 0;
 582
 583       /* Decrement the loop counter */
 584       k--;
 585     }
 586
 587     /* If the count is not a multiple of 4, copy remaining samples here.
 588      ** No loop unrolling is used. */
 589     k = (srcBLen - 1u) % 0x4u;
 590
 591     while(k > 0u)
 592     {
 593       /* copy second buffer in reversal manner for remaining samples */
 594       *pScr1++ = 0;
 595
 596       /* Decrement the loop counter */
 597       k--;
 598     }
 599
 600
 601     /* Temporary pointer for scratch2 */
 602     py = pScratch2;
 603
 604     /* Initialization of pIn2 pointer */
 605     pIn2 = (q7_t *) py;
 606
 607     pScr2 = py;
 608
 609     pOut = pDst + firstIndex;
 610
 611     pScratch1 += firstIndex;
 612
 613     /* Actual convolution process starts here */
 614     blkCnt = (numPoints) >> 2;
 615
 616
 617     while(blkCnt > 0)
 618     {
 619       /* Initialze temporary scratch pointer as scratch1 */
 620       pScr1 = pScratch1;
 621
 622       /* Clear Accumlators */
 623       acc0 = 0;
 624       acc1 = 0;
 625       acc2 = 0;
 626       acc3 = 0;
 627
 628       /* Read two samples from scratch1 buffer */
 629       x10 = *pScr1++;
 630       x11 = *pScr1++;
 631
 632       /* Read next two samples from scratch1 buffer */
 633       x20 = *pScr1++;
 634       x21 = *pScr1++;
 635
 636       tapCnt = (srcBLen) >> 2u;
 637
 638       while(tapCnt > 0u)
 639       {
 640
 641         /* Read four samples from smaller buffer */
 642         y10 = *pScr2;
 643         y11 = *(pScr2 + 1u);
 644
 645         /* multiply and accumlate */
 646         acc0 += (q31_t) x10 *y10;
 647         acc0 += (q31_t) x11 *y11;
 648         acc2 += (q31_t) x20 *y10;
 649         acc2 += (q31_t) x21 *y11;
 650
 651
 652         acc1 += (q31_t) x11 *y10;
 653         acc1 += (q31_t) x20 *y11;
 654
 655         /* Read next two samples from scratch1 buffer */
 656         x10 = *pScr1;
 657         x11 = *(pScr1 + 1u);
 658
 659         /* multiply and accumlate */
 660         acc3 += (q31_t) x21 *y10;
 661         acc3 += (q31_t) x10 *y11;
 662
 663         /* Read next two samples from scratch2 buffer */
 664         y10 = *(pScr2 + 2u);
 665         y11 = *(pScr2 + 3u);
 666
 667         /* multiply and accumlate */
 668         acc0 += (q31_t) x20 *y10;
 669         acc0 += (q31_t) x21 *y11;
 670         acc2 += (q31_t) x10 *y10;
 671         acc2 += (q31_t) x11 *y11;
 672         acc1 += (q31_t) x21 *y10;
 673         acc1 += (q31_t) x10 *y11;
 674
 675         /* Read next two samples from scratch1 buffer */
 676         x20 = *(pScr1 + 2);
 677         x21 = *(pScr1 + 3);
 678
 679         /* multiply and accumlate */
 680         acc3 += (q31_t) x11 *y10;
 681         acc3 += (q31_t) x20 *y11;
 682
 683         /* update scratch pointers */
 684
 685         pScr1 += 4u;
 686         pScr2 += 4u;
 687
 688         /* Decrement the loop counter */
 689         tapCnt--;
 690       }
 691
 692
 693
 694       /* Update scratch pointer for remaining samples of smaller length sequence */
 695       pScr1 -= 4u;
 696
 697
 698       /* apply same above for remaining samples of smaller length sequence */
 699       tapCnt = (srcBLen) & 3u;
 700
 701       while(tapCnt > 0u)
 702       {
 703
 704         /* accumlate the results */
 705         acc0 += (*pScr1++ * *pScr2);
 706         acc1 += (*pScr1++ * *pScr2);
 707         acc2 += (*pScr1++ * *pScr2);
 708         acc3 += (*pScr1++ * *pScr2++);
 709
 710         pScr1 -= 3u;
 711
 712         /* Decrement the loop counter */
 713         tapCnt--;
 714       }
 715
 716       blkCnt--;
 717
 718       /* Store the result in the accumulator in the destination buffer. */
 719       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
 720       *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
 721       *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
 722       *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
 723
 724       /* Initialization of inputB pointer */
 725       pScr2 = py;
 726
 727       pScratch1 += 4u;
 728
 729     }
 730
 731     blkCnt = (numPoints) & 0x3;
 732
 733     /* Calculate convolution for remaining samples of Bigger length sequence */
 734     while(blkCnt > 0)
 735     {
 736       /* Initialze temporary scratch pointer as scratch1 */
 737       pScr1 = pScratch1;
 738
 739       /* Clear Accumlators */
 740       acc0 = 0;
 741
 742       tapCnt = (srcBLen) >> 1u;
 743
 744       while(tapCnt > 0u)
 745       {
 746
 747         /* Read next two samples from scratch1 buffer */
 748         x10 = *pScr1++;
 749         x11 = *pScr1++;
 750
 751         /* Read two samples from smaller buffer */
 752         y10 = *pScr2++;
 753         y11 = *pScr2++;
 754
 755         /* multiply and accumlate */
 756         acc0 += (q31_t) x10 *y10;
 757         acc0 += (q31_t) x11 *y11;
 758
 759         /* Decrement the loop counter */
 760         tapCnt--;
 761       }
 762
 763       tapCnt = (srcBLen) & 1u;
 764
 765       /* apply same above for remaining samples of smaller length sequence */
 766       while(tapCnt > 0u)
 767       {
 768
 769         /* accumlate the results */
 770         acc0 += (*pScr1++ * *pScr2++);
 771
 772         /* Decrement the loop counter */
 773         tapCnt--;
 774       }
 775
 776       blkCnt--;
 777
 778       /* Store the result in the accumulator in the destination buffer. */
 779       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
 780
 781       /* Initialization of inputB pointer */
 782       pScr2 = py;
 783
 784       pScratch1 += 1u;
 785
 786     }
 787
 788     /* set status as ARM_MATH_SUCCESS */
 789     status = ARM_MATH_SUCCESS;
 790
 791   }
 792
 793   return (status);
 794
 795 }
 796
 797 #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
 798
 799
 800
 801 /**
 802  * @} end of PartialConv group
 803  */