1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
4 * $Date: 19. March 2015
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_opt_q7.c
10 * Description: Partial convolution of Q7 sequences.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @addtogroup PartialConv
53 * @brief Partial convolution of Q7 sequences.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written.
59 * @param[in] firstIndex is the first output sample to start with.
60 * @param[in] numPoints is the number of output points to be computed.
61 * @param[in] *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
62 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
63 * @return ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
66 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
67 * In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
74 #ifndef UNALIGNED_SUPPORT_DISABLE
/*
 * arm_conv_partial_opt_q7 -- variant that follows the
 * "#ifndef UNALIGNED_SUPPORT_DISABLE" test.
 *
 * The statements below work on q15_t scratch data through the __SIMD32 /
 * _SIMD32_OFFSET accessors and accumulate products with __SMLAD / __SMLADX
 * into four q31_t accumulators.  Each accumulator is shifted right by 7,
 * saturated to 8 bits with __SSAT and converted to q7_t; four results are
 * packed with __PACKq7 and written with a single 32-bit store.
 *
 * status is set to ARM_MATH_ARGUMENT_ERROR when
 * (firstIndex + numPoints) > (srcALen + srcBLen - 1), and to
 * ARM_MATH_SUCCESS at the end of the processing path.
 *
 * NOTE(review): the parameter list, braces, loop headers and several pointer
 * initialisations are not present in this copy of the file; the comments
 * below describe only the statements that are visible.
 */
76 arm_status
arm_conv_partial_opt_q7(
88 q15_t
*pScr2
, *pScr1
; /* Intermediate pointers for the scratch buffers */
89 q15_t x4
; /* Temporary input variable (one sample widened to q15_t) */
90 q7_t
*pIn1
, *pIn2
; /* inputA and inputB pointer */
91 uint32_t j
, k
, blkCnt
, tapCnt
; /* loop counters */
92 q7_t
*px
; /* Temporary read pointer; set below to the last element of pIn2 */
93 q15_t
*py
; /* Temporary input2 pointer */
94 q31_t acc0
, acc1
, acc2
, acc3
; /* Accumulators, one per output sample in a group of four */
95 q31_t x1
, x2
, x3
, y1
; /* Temporary input variables (32-bit words read from the scratch buffers) */
97 q7_t
*pOut
= pDst
; /* output pointer */
98 q7_t out0
, out1
, out2
, out3
; /* Saturated q7 results waiting for the packed store */
100 /* Check for range of output samples to be calculated */
101 if((firstIndex
+ numPoints
) > ((srcALen
+ (srcBLen
- 1u))))
103 /* Set status as ARM_MATH_ARGUMENT_ERROR */
104 status
= ARM_MATH_ARGUMENT_ERROR
;
109 /* The algorithm implementation is based on the lengths of the inputs. */
110 /* srcB is always made to slide across srcA. */
111 /* So srcBLen is always considered as shorter or equal to srcALen */
112 if(srcALen
>= srcBLen
)
114 /* Initialization of inputA pointer */
117 /* Initialization of inputB pointer */
122 /* Initialization of inputA pointer */
125 /* Initialization of inputB pointer */
128 /* srcBLen is always considered as shorter or equal to srcALen */
134 /* pointer to take end of scratch2 buffer */
137 /* points to smaller length sequence */
/* px starts at the last element of pIn2 (index srcBLen - 1) */
138 px
= pIn2
+ srcBLen
- 1;
140 /* Apply loop unrolling and do 4 Copies simultaneously. */
143 /* First part of the processing with loop unrolling copies 4 data points at a time.
144 ** a second loop below copies for the remaining 1 to 3 samples. */
147 /* copy second buffer in reversal manner */
157 /* Decrement the loop counter */
161 /* If the count is not a multiple of 4, copy remaining samples here.
162 ** No loop unrolling is used. */
167 /* copy second buffer in reversal manner for remaining samples */
171 /* Decrement the loop counter */
175 /* Initialize temporary scratch pointer */
178 /* Fill (srcBLen - 1u) zeros in scratch buffer */
179 arm_fill_q15(0, pScr1
, (srcBLen
- 1u));
181 /* Update temporary scratch pointer */
182 pScr1
+= (srcBLen
- 1u);
184 /* Copy (srcALen) samples in scratch buffer */
185 /* Apply loop unrolling and do 4 Copies simultaneously. */
188 /* First part of the processing with loop unrolling copies 4 data points at a time.
189 ** a second loop below copies for the remaining 1 to 3 samples. */
192 /* Read the next sample through pIn1 (forward, post-increment) and widen it to q15_t */
193 x4
= (q15_t
) * pIn1
++;
195 x4
= (q15_t
) * pIn1
++;
197 x4
= (q15_t
) * pIn1
++;
199 x4
= (q15_t
) * pIn1
++;
202 /* Decrement the loop counter */
206 /* If the count is not a multiple of 4, copy remaining samples here.
207 ** No loop unrolling is used. */
212 /* Read the next remaining sample through pIn1 and widen it to q15_t */
213 x4
= (q15_t
) * pIn1
++;
216 /* Decrement the loop counter */
220 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
221 arm_fill_q15(0, pScr1
, (srcBLen
- 1u));
/* Advance the scratch pointer past the zeros just written */
224 pScr1
+= (srcBLen
- 1u);
227 /* Temporary pointer for scratch2 */
230 /* Initialization of pIn2 pointer */
/* Output is written starting at element firstIndex of pDst */
235 pOut
= pDst
+ firstIndex
;
/* Offset the scratch1 base pointer by firstIndex as well */
237 pScratch1
+= firstIndex
;
239 /* Actual convolution process starts here */
/* blkCnt = number of complete groups of four output samples */
240 blkCnt
= (numPoints
) >> 2;
245 /* Initialize temporary scratch pointer as scratch1 */
248 /* Clear Accumulators */
254 /* Read two samples from scratch1 buffer */
255 x1
= *__SIMD32(pScr1
)++;
257 /* Read next two samples from scratch1 buffer */
258 x2
= *__SIMD32(pScr1
)++;
/* tapCnt = srcBLen / 4 (integer division) */
260 tapCnt
= (srcBLen
) >> 2u;
265 /* Read one 32-bit word from the smaller buffer.
   NOTE(review): the original comment said "four samples"; pScr2 is q15_t*,
   so a 32-bit read presumably holds two -- confirm against _SIMD32_OFFSET. */
266 y1
= _SIMD32_OFFSET(pScr2
);
268 /* multiply and accumulate */
269 acc0
= __SMLAD(x1
, y1
, acc0
);
270 acc2
= __SMLAD(x2
, y1
, acc2
);
272 /* pack input data: x3 is built from x1 and x2 with __PKHBT; the two
   assignments after the ARM_MATH_BIG_ENDIAN test differ only in operand order */
273 #ifndef ARM_MATH_BIG_ENDIAN
274 x3
= __PKHBT(x2
, x1
, 0);
276 x3
= __PKHBT(x1
, x2
, 0);
279 /* multiply and accumulate */
280 acc1
= __SMLADX(x3
, y1
, acc1
);
282 /* Read next two samples from scratch1 buffer */
283 x1
= *__SIMD32(pScr1
)++;
285 /* pack input data (operand order again depends on ARM_MATH_BIG_ENDIAN) */
286 #ifndef ARM_MATH_BIG_ENDIAN
287 x3
= __PKHBT(x1
, x2
, 0);
289 x3
= __PKHBT(x2
, x1
, 0);
292 acc3
= __SMLADX(x3
, y1
, acc3
);
294 /* Read the next 32-bit word from the smaller buffer, two q15_t elements
   past pScr2.  NOTE(review): original comment said "four samples" -- confirm. */
295 y1
= _SIMD32_OFFSET(pScr2
+ 2u);
297 acc0
= __SMLAD(x2
, y1
, acc0
);
299 acc2
= __SMLAD(x1
, y1
, acc2
);
301 acc1
= __SMLADX(x3
, y1
, acc1
);
303 x2
= *__SIMD32(pScr1
)++;
305 #ifndef ARM_MATH_BIG_ENDIAN
306 x3
= __PKHBT(x2
, x1
, 0);
308 x3
= __PKHBT(x1
, x2
, 0);
311 acc3
= __SMLADX(x3
, y1
, acc3
);
316 /* Decrement the loop counter */
322 /* Update scratch pointer for remaining samples of smaller length sequence */
326 /* apply same above for remaining samples of smaller length sequence */
/* tapCnt = srcBLen modulo 4 */
327 tapCnt
= (srcBLen
) & 3u;
332 /* accumulate the results: all four accumulators multiply by the same
   *pScr2 value; only the last statement advances pScr2, while pScr1
   advances on every statement */
333 acc0
+= (*pScr1
++ * *pScr2
);
334 acc1
+= (*pScr1
++ * *pScr2
);
335 acc2
+= (*pScr1
++ * *pScr2
);
336 acc3
+= (*pScr1
++ * *pScr2
++);
340 /* Decrement the loop counter */
346 /* Store the result in the accumulator in the destination buffer:
   shift right by 7, saturate to 8 bits, convert to q7_t */
347 out0
= (q7_t
) (__SSAT(acc0
>> 7u, 8));
348 out1
= (q7_t
) (__SSAT(acc1
>> 7u, 8));
349 out2
= (q7_t
) (__SSAT(acc2
>> 7u, 8));
350 out3
= (q7_t
) (__SSAT(acc3
>> 7u, 8));
/* Pack the four q7 results and write them with one 32-bit store */
352 *__SIMD32(pOut
)++ = __PACKq7(out0
, out1
, out2
, out3
);
354 /* Initialization of inputB pointer */
/* blkCnt = output samples left over after the groups of four */
361 blkCnt
= (numPoints
) & 0x3;
363 /* Calculate convolution for the remaining samples of the longer sequence */
366 /* Initialize temporary scratch pointer as scratch1 */
369 /* Clear Accumulators */
/* tapCnt = srcBLen / 2 (integer division) */
372 tapCnt
= (srcBLen
) >> 1u;
377 /* Read next two samples from scratch1 buffer */
378 x1
= *__SIMD32(pScr1
)++;
380 /* Read two samples from smaller buffer */
381 y1
= *__SIMD32(pScr2
)++;
383 acc0
= __SMLAD(x1
, y1
, acc0
);
385 /* Decrement the loop counter */
/* tapCnt = 1 when srcBLen is odd, otherwise 0 */
389 tapCnt
= (srcBLen
) & 1u;
391 /* apply same above for remaining samples of smaller length sequence */
395 /* accumulate the results */
396 acc0
+= (*pScr1
++ * *pScr2
++);
398 /* Decrement the loop counter */
404 /* Store the result in the accumulator in the destination buffer. */
405 *pOut
++ = (q7_t
) (__SSAT(acc0
>> 7u, 8));
407 /* Initialization of inputB pointer */
414 /* set status as ARM_MATH_SUCCESS */
415 status
= ARM_MATH_SUCCESS
;
/*
 * arm_conv_partial_opt_q7 -- second definition in this file.
 *
 * The visible multiply-accumulate statements operate on individual q15_t
 * operands (x10, x11, x20, x21, y10, y11), each product being formed after a
 * cast to q31_t, instead of using the SIMD intrinsics of the first
 * definition.  Each accumulator is shifted right by 7, saturated to 8 bits
 * with __SSAT, converted to q7_t and stored through pOut one sample at a time.
 *
 * status is set to ARM_MATH_ARGUMENT_ERROR when
 * (firstIndex + numPoints) > (srcALen + srcBLen - 1), and to
 * ARM_MATH_SUCCESS at the end of the processing path.
 *
 * NOTE(review): presumably this is the alternative branch of the
 * UNALIGNED_SUPPORT_DISABLE conditional; the "#else" line, the parameter
 * list, braces, loop headers and the loads of x10..y11 are not present in
 * this copy of the file -- confirm against the complete source.
 */
426 arm_status
arm_conv_partial_opt_q7(
438 q15_t
*pScr2
, *pScr1
; /* Intermediate pointers for the scratch buffers */
439 q15_t x4
; /* Temporary input variable (one sample widened to q15_t) */
440 q7_t
*pIn1
, *pIn2
; /* inputA and inputB pointer */
441 uint32_t j
, k
, blkCnt
, tapCnt
; /* loop counters */
442 q7_t
*px
; /* Temporary read pointer; set below to the last element of pIn2 */
443 q15_t
*py
; /* Temporary input2 pointer */
444 q31_t acc0
, acc1
, acc2
, acc3
; /* Accumulators, one per output sample in a group of four */
446 q7_t
*pOut
= pDst
; /* output pointer */
447 q15_t x10
, x11
, x20
, x21
; /* Temporary input variables */
448 q15_t y10
, y11
; /* Temporary input variables */
450 /* Check for range of output samples to be calculated */
451 if((firstIndex
+ numPoints
) > ((srcALen
+ (srcBLen
- 1u))))
453 /* Set status as ARM_MATH_ARGUMENT_ERROR */
454 status
= ARM_MATH_ARGUMENT_ERROR
;
459 /* The algorithm implementation is based on the lengths of the inputs. */
460 /* srcB is always made to slide across srcA. */
461 /* So srcBLen is always considered as shorter or equal to srcALen */
462 if(srcALen
>= srcBLen
)
464 /* Initialization of inputA pointer */
467 /* Initialization of inputB pointer */
472 /* Initialization of inputA pointer */
475 /* Initialization of inputB pointer */
478 /* srcBLen is always considered as shorter or equal to srcALen */
484 /* pointer to take end of scratch2 buffer */
487 /* points to smaller length sequence */
/* px starts at the last element of pIn2 (index srcBLen - 1) */
488 px
= pIn2
+ srcBLen
- 1;
490 /* Apply loop unrolling and do 4 Copies simultaneously. */
493 /* First part of the processing with loop unrolling copies 4 data points at a time.
494 ** a second loop below copies for the remaining 1 to 3 samples. */
497 /* copy second buffer in reversal manner */
507 /* Decrement the loop counter */
511 /* If the count is not a multiple of 4, copy remaining samples here.
512 ** No loop unrolling is used. */
517 /* copy second buffer in reversal manner for remaining samples */
521 /* Decrement the loop counter */
525 /* Initialize temporary scratch pointer */
528 /* Fill (srcBLen - 1u) zeros in scratch buffer */
529 arm_fill_q15(0, pScr1
, (srcBLen
- 1u));
531 /* Update temporary scratch pointer */
532 pScr1
+= (srcBLen
- 1u);
534 /* Copy (srcALen) samples in scratch buffer */
535 /* Apply loop unrolling and do 4 Copies simultaneously. */
538 /* First part of the processing with loop unrolling copies 4 data points at a time.
539 ** a second loop below copies for the remaining 1 to 3 samples. */
542 /* Read the next sample through pIn1 (forward, post-increment) and widen it to q15_t */
543 x4
= (q15_t
) * pIn1
++;
545 x4
= (q15_t
) * pIn1
++;
547 x4
= (q15_t
) * pIn1
++;
549 x4
= (q15_t
) * pIn1
++;
552 /* Decrement the loop counter */
556 /* If the count is not a multiple of 4, copy remaining samples here.
557 ** No loop unrolling is used. */
562 /* Read the next remaining sample through pIn1 and widen it to q15_t */
563 x4
= (q15_t
) * pIn1
++;
566 /* Decrement the loop counter */
570 /* Apply loop unrolling and do 4 Copies simultaneously. */
/* k = (srcBLen - 1) / 4 (integer division) */
571 k
= (srcBLen
- 1u) >> 2u;
573 /* First part of the processing with loop unrolling copies 4 data points at a time.
574 ** a second loop below copies for the remaining 1 to 3 samples. */
577 /* copy second buffer in reversal manner */
583 /* Decrement the loop counter */
587 /* If the count is not a multiple of 4, copy remaining samples here.
588 ** No loop unrolling is used. */
/* k = (srcBLen - 1) modulo 4 */
589 k
= (srcBLen
- 1u) % 0x4u
;
593 /* copy second buffer in reversal manner for remaining samples */
596 /* Decrement the loop counter */
601 /* Temporary pointer for scratch2 */
604 /* Initialization of pIn2 pointer */
/* Output is written starting at element firstIndex of pDst */
609 pOut
= pDst
+ firstIndex
;
/* Offset the scratch1 base pointer by firstIndex as well */
611 pScratch1
+= firstIndex
;
613 /* Actual convolution process starts here */
/* blkCnt = number of complete groups of four output samples */
614 blkCnt
= (numPoints
) >> 2;
619 /* Initialize temporary scratch pointer as scratch1 */
622 /* Clear Accumulators */
628 /* Read two samples from scratch1 buffer */
632 /* Read next two samples from scratch1 buffer */
/* tapCnt = srcBLen / 4 (integer division) */
636 tapCnt
= (srcBLen
) >> 2u;
641 /* Read four samples from smaller buffer */
645 /* multiply and accumulate */
646 acc0
+= (q31_t
) x10
*y10
;
647 acc0
+= (q31_t
) x11
*y11
;
648 acc2
+= (q31_t
) x20
*y10
;
649 acc2
+= (q31_t
) x21
*y11
;
652 acc1
+= (q31_t
) x11
*y10
;
653 acc1
+= (q31_t
) x20
*y11
;
655 /* Read next two samples from scratch1 buffer */
659 /* multiply and accumulate */
660 acc3
+= (q31_t
) x21
*y10
;
661 acc3
+= (q31_t
) x10
*y11
;
663 /* Read next two samples from scratch2 buffer */
667 /* multiply and accumulate */
668 acc0
+= (q31_t
) x20
*y10
;
669 acc0
+= (q31_t
) x21
*y11
;
670 acc2
+= (q31_t
) x10
*y10
;
671 acc2
+= (q31_t
) x11
*y11
;
672 acc1
+= (q31_t
) x21
*y10
;
673 acc1
+= (q31_t
) x10
*y11
;
675 /* Read next two samples from scratch1 buffer */
679 /* multiply and accumulate */
680 acc3
+= (q31_t
) x11
*y10
;
681 acc3
+= (q31_t
) x20
*y11
;
683 /* update scratch pointers */
688 /* Decrement the loop counter */
694 /* Update scratch pointer for remaining samples of smaller length sequence */
698 /* apply same above for remaining samples of smaller length sequence */
/* tapCnt = srcBLen modulo 4 */
699 tapCnt
= (srcBLen
) & 3u;
704 /* accumulate the results: all four accumulators multiply by the same
   *pScr2 value; only the last statement advances pScr2, while pScr1
   advances on every statement */
705 acc0
+= (*pScr1
++ * *pScr2
);
706 acc1
+= (*pScr1
++ * *pScr2
);
707 acc2
+= (*pScr1
++ * *pScr2
);
708 acc3
+= (*pScr1
++ * *pScr2
++);
712 /* Decrement the loop counter */
718 /* Store the result in the accumulator in the destination buffer:
   shift right by 7, saturate to 8 bits, convert to q7_t, one sample per store */
719 *pOut
++ = (q7_t
) (__SSAT(acc0
>> 7u, 8));
720 *pOut
++ = (q7_t
) (__SSAT(acc1
>> 7u, 8));
721 *pOut
++ = (q7_t
) (__SSAT(acc2
>> 7u, 8));
722 *pOut
++ = (q7_t
) (__SSAT(acc3
>> 7u, 8));
724 /* Initialization of inputB pointer */
/* blkCnt = output samples left over after the groups of four */
731 blkCnt
= (numPoints
) & 0x3;
733 /* Calculate convolution for the remaining samples of the longer sequence */
736 /* Initialize temporary scratch pointer as scratch1 */
739 /* Clear Accumulators */
/* tapCnt = srcBLen / 2 (integer division) */
742 tapCnt
= (srcBLen
) >> 1u;
747 /* Read next two samples from scratch1 buffer */
751 /* Read two samples from smaller buffer */
755 /* multiply and accumulate */
756 acc0
+= (q31_t
) x10
*y10
;
757 acc0
+= (q31_t
) x11
*y11
;
759 /* Decrement the loop counter */
/* tapCnt = 1 when srcBLen is odd, otherwise 0 */
763 tapCnt
= (srcBLen
) & 1u;
765 /* apply same above for remaining samples of smaller length sequence */
769 /* accumulate the results */
770 acc0
+= (*pScr1
++ * *pScr2
++);
772 /* Decrement the loop counter */
778 /* Store the result in the accumulator in the destination buffer. */
779 *pOut
++ = (q7_t
) (__SSAT(acc0
>> 7u, 8));
781 /* Initialization of inputB pointer */
788 /* set status as ARM_MATH_SUCCESS */
789 status
= ARM_MATH_SUCCESS
;
797 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
802 * @} end of PartialConv group