F1 and F3 HAL / LL libraries
[betaflight.git] / lib / main / STM32F3 / Drivers / CMSIS / DSP_Lib / Source / FilteringFunctions / arm_correlate_f32.c
blob6a8127b80fd9cc1b0fe78f1b4ced82eb4e882c6c
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_f32.c
9 *
10 * Description: Correlation of floating-point sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
41 #include "arm_math.h"
43 /**
44 * @ingroup groupFilters
47 /**
48 * @defgroup Corr Correlation
50 * Correlation is a mathematical operation that is similar to convolution.
51 * As with convolution, correlation uses two signals to produce a third signal.
52 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
53 * Correlation is commonly used to measure the similarity between two signals.
54 * It has applications in pattern recognition, cryptanalysis, and searching.
55 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
56 * Fast versions of the Q15 and Q31 functions are also provided.
58 * \par Algorithm
59 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
60 * The convolution of the two signals is denoted by
61 * <pre>
62 * c[n] = a[n] * b[n]
63 * </pre>
64 * In correlation, one of the signals is flipped in time
65 * <pre>
66 * c[n] = a[n] * b[-n]
67 * </pre>
69 * \par
70 * and this is mathematically defined as
71 * \image html CorrelateEquation.gif
72 * \par
73 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
74 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
75 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
77 * <b>Note</b>
78 * \par
79 * The <code>pDst</code> should be initialized to all zeros before being used.
81 * <b>Fixed-Point Behavior</b>
82 * \par
83 * Correlation requires summing up a large number of intermediate products.
84 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
85 * Refer to the function specific documentation below for further details of the particular algorithm used.
88 * <b>Fast Versions</b>
90 * \par
91 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
92 * the input signals should be scaled down to avoid intermediate overflows.
95 * <b>Opt Versions</b>
97 * \par
98 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
99 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate
102 /**
103 * @addtogroup Corr
104 * @{
106 /**
107 * @brief Correlation of floating-point sequences.
108 * @param[in] *pSrcA points to the first input sequence.
109 * @param[in] srcALen length of the first input sequence.
110 * @param[in] *pSrcB points to the second input sequence.
111 * @param[in] srcBLen length of the second input sequence.
112 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
113 * @return none.
116 void arm_correlate_f32(
117 float32_t * pSrcA,
118 uint32_t srcALen,
119 float32_t * pSrcB,
120 uint32_t srcBLen,
121 float32_t * pDst)
125 #ifndef ARM_MATH_CM0_FAMILY
127 /* Run the below code for Cortex-M4 and Cortex-M3 */
129 float32_t *pIn1; /* inputA pointer */
130 float32_t *pIn2; /* inputB pointer */
131 float32_t *pOut = pDst; /* output pointer */
132 float32_t *px; /* Intermediate inputA pointer */
133 float32_t *py; /* Intermediate inputB pointer */
134 float32_t *pSrc1; /* Intermediate pointers */
135 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
136 float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
137 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */
138 int32_t inc = 1; /* Destination address modifier */
141 /* The algorithm implementation is based on the lengths of the inputs. */
142 /* srcB is always made to slide across srcA. */
143 /* So srcBLen is always considered as shorter or equal to srcALen */
144 /* But CORR(x, y) is reverse of CORR(y, x) */
145 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
146 /* and the destination pointer modifier, inc is set to -1 */
147 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
148 /* But to improve the performance,
149 * we assume zeroes in the output instead of zero padding either of the the inputs*/
150 /* If srcALen > srcBLen,
151 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
152 /* If srcALen < srcBLen,
153 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
154 if(srcALen >= srcBLen)
156 /* Initialization of inputA pointer */
157 pIn1 = pSrcA;
159 /* Initialization of inputB pointer */
160 pIn2 = pSrcB;
162 /* Number of output samples is calculated */
163 outBlockSize = (2u * srcALen) - 1u;
165 /* When srcALen > srcBLen, zero padding has to be done to srcB
166 * to make their lengths equal.
167 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
168 * number of output samples are made zero */
169 j = outBlockSize - (srcALen + (srcBLen - 1u));
171 /* Updating the pointer position to non zero value */
172 pOut += j;
174 //while(j > 0u)
175 //{
176 // /* Zero is stored in the destination buffer */
177 // *pOut++ = 0.0f;
179 // /* Decrement the loop counter */
180 // j--;
181 //}
184 else
186 /* Initialization of inputA pointer */
187 pIn1 = pSrcB;
189 /* Initialization of inputB pointer */
190 pIn2 = pSrcA;
192 /* srcBLen is always considered as shorter or equal to srcALen */
193 j = srcBLen;
194 srcBLen = srcALen;
195 srcALen = j;
197 /* CORR(x, y) = Reverse order(CORR(y, x)) */
198 /* Hence set the destination pointer to point to the last output sample */
199 pOut = pDst + ((srcALen + srcBLen) - 2u);
201 /* Destination address modifier is set to -1 */
202 inc = -1;
206 /* The function is internally
207 * divided into three parts according to the number of multiplications that has to be
208 * taken place between inputA samples and inputB samples. In the first part of the
209 * algorithm, the multiplications increase by one for every iteration.
210 * In the second part of the algorithm, srcBLen number of multiplications are done.
211 * In the third part of the algorithm, the multiplications decrease by one
212 * for every iteration.*/
213 /* The algorithm is implemented in three stages.
214 * The loop counters of each stage is initiated here. */
215 blockSize1 = srcBLen - 1u;
216 blockSize2 = srcALen - (srcBLen - 1u);
217 blockSize3 = blockSize1;
219 /* --------------------------
220 * Initializations of stage1
221 * -------------------------*/
223 /* sum = x[0] * y[srcBlen - 1]
224 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
225 * ....
226 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
229 /* In this stage the MAC operations are increased by 1 for every iteration.
230 The count variable holds the number of MAC operations performed */
231 count = 1u;
233 /* Working pointer of inputA */
234 px = pIn1;
236 /* Working pointer of inputB */
237 pSrc1 = pIn2 + (srcBLen - 1u);
238 py = pSrc1;
240 /* ------------------------
241 * Stage1 process
242 * ----------------------*/
244 /* The first stage starts here */
245 while(blockSize1 > 0u)
247 /* Accumulator is made zero for every iteration */
248 sum = 0.0f;
250 /* Apply loop unrolling and compute 4 MACs simultaneously. */
251 k = count >> 2u;
253 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
254 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
255 while(k > 0u)
257 /* x[0] * y[srcBLen - 4] */
258 sum += *px++ * *py++;
259 /* x[1] * y[srcBLen - 3] */
260 sum += *px++ * *py++;
261 /* x[2] * y[srcBLen - 2] */
262 sum += *px++ * *py++;
263 /* x[3] * y[srcBLen - 1] */
264 sum += *px++ * *py++;
266 /* Decrement the loop counter */
267 k--;
270 /* If the count is not a multiple of 4, compute any remaining MACs here.
271 ** No loop unrolling is used. */
272 k = count % 0x4u;
274 while(k > 0u)
276 /* Perform the multiply-accumulate */
277 /* x[0] * y[srcBLen - 1] */
278 sum += *px++ * *py++;
280 /* Decrement the loop counter */
281 k--;
284 /* Store the result in the accumulator in the destination buffer. */
285 *pOut = sum;
286 /* Destination pointer is updated according to the address modifier, inc */
287 pOut += inc;
289 /* Update the inputA and inputB pointers for next MAC calculation */
290 py = pSrc1 - count;
291 px = pIn1;
293 /* Increment the MAC count */
294 count++;
296 /* Decrement the loop counter */
297 blockSize1--;
300 /* --------------------------
301 * Initializations of stage2
302 * ------------------------*/
304 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
305 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
306 * ....
307 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
310 /* Working pointer of inputA */
311 px = pIn1;
313 /* Working pointer of inputB */
314 py = pIn2;
316 /* count is index by which the pointer pIn1 to be incremented */
317 count = 0u;
319 /* -------------------
320 * Stage2 process
321 * ------------------*/
323 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
324 * So, to loop unroll over blockSize2,
325 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
326 if(srcBLen >= 4u)
328 /* Loop unroll over blockSize2, by 4 */
329 blkCnt = blockSize2 >> 2u;
331 while(blkCnt > 0u)
333 /* Set all accumulators to zero */
334 acc0 = 0.0f;
335 acc1 = 0.0f;
336 acc2 = 0.0f;
337 acc3 = 0.0f;
339 /* read x[0], x[1], x[2] samples */
340 x0 = *(px++);
341 x1 = *(px++);
342 x2 = *(px++);
344 /* Apply loop unrolling and compute 4 MACs simultaneously. */
345 k = srcBLen >> 2u;
347 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
348 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
351 /* Read y[0] sample */
352 c0 = *(py++);
354 /* Read x[3] sample */
355 x3 = *(px++);
357 /* Perform the multiply-accumulate */
358 /* acc0 += x[0] * y[0] */
359 acc0 += x0 * c0;
360 /* acc1 += x[1] * y[0] */
361 acc1 += x1 * c0;
362 /* acc2 += x[2] * y[0] */
363 acc2 += x2 * c0;
364 /* acc3 += x[3] * y[0] */
365 acc3 += x3 * c0;
367 /* Read y[1] sample */
368 c0 = *(py++);
370 /* Read x[4] sample */
371 x0 = *(px++);
373 /* Perform the multiply-accumulate */
374 /* acc0 += x[1] * y[1] */
375 acc0 += x1 * c0;
376 /* acc1 += x[2] * y[1] */
377 acc1 += x2 * c0;
378 /* acc2 += x[3] * y[1] */
379 acc2 += x3 * c0;
380 /* acc3 += x[4] * y[1] */
381 acc3 += x0 * c0;
383 /* Read y[2] sample */
384 c0 = *(py++);
386 /* Read x[5] sample */
387 x1 = *(px++);
389 /* Perform the multiply-accumulates */
390 /* acc0 += x[2] * y[2] */
391 acc0 += x2 * c0;
392 /* acc1 += x[3] * y[2] */
393 acc1 += x3 * c0;
394 /* acc2 += x[4] * y[2] */
395 acc2 += x0 * c0;
396 /* acc3 += x[5] * y[2] */
397 acc3 += x1 * c0;
399 /* Read y[3] sample */
400 c0 = *(py++);
402 /* Read x[6] sample */
403 x2 = *(px++);
405 /* Perform the multiply-accumulates */
406 /* acc0 += x[3] * y[3] */
407 acc0 += x3 * c0;
408 /* acc1 += x[4] * y[3] */
409 acc1 += x0 * c0;
410 /* acc2 += x[5] * y[3] */
411 acc2 += x1 * c0;
412 /* acc3 += x[6] * y[3] */
413 acc3 += x2 * c0;
416 } while(--k);
418 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
419 ** No loop unrolling is used. */
420 k = srcBLen % 0x4u;
422 while(k > 0u)
424 /* Read y[4] sample */
425 c0 = *(py++);
427 /* Read x[7] sample */
428 x3 = *(px++);
430 /* Perform the multiply-accumulates */
431 /* acc0 += x[4] * y[4] */
432 acc0 += x0 * c0;
433 /* acc1 += x[5] * y[4] */
434 acc1 += x1 * c0;
435 /* acc2 += x[6] * y[4] */
436 acc2 += x2 * c0;
437 /* acc3 += x[7] * y[4] */
438 acc3 += x3 * c0;
440 /* Reuse the present samples for the next MAC */
441 x0 = x1;
442 x1 = x2;
443 x2 = x3;
445 /* Decrement the loop counter */
446 k--;
449 /* Store the result in the accumulator in the destination buffer. */
450 *pOut = acc0;
451 /* Destination pointer is updated according to the address modifier, inc */
452 pOut += inc;
454 *pOut = acc1;
455 pOut += inc;
457 *pOut = acc2;
458 pOut += inc;
460 *pOut = acc3;
461 pOut += inc;
463 /* Increment the pointer pIn1 index, count by 4 */
464 count += 4u;
466 /* Update the inputA and inputB pointers for next MAC calculation */
467 px = pIn1 + count;
468 py = pIn2;
470 /* Decrement the loop counter */
471 blkCnt--;
474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
475 ** No loop unrolling is used. */
476 blkCnt = blockSize2 % 0x4u;
478 while(blkCnt > 0u)
480 /* Accumulator is made zero for every iteration */
481 sum = 0.0f;
483 /* Apply loop unrolling and compute 4 MACs simultaneously. */
484 k = srcBLen >> 2u;
486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
488 while(k > 0u)
490 /* Perform the multiply-accumulates */
491 sum += *px++ * *py++;
492 sum += *px++ * *py++;
493 sum += *px++ * *py++;
494 sum += *px++ * *py++;
496 /* Decrement the loop counter */
497 k--;
500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
501 ** No loop unrolling is used. */
502 k = srcBLen % 0x4u;
504 while(k > 0u)
506 /* Perform the multiply-accumulate */
507 sum += *px++ * *py++;
509 /* Decrement the loop counter */
510 k--;
513 /* Store the result in the accumulator in the destination buffer. */
514 *pOut = sum;
515 /* Destination pointer is updated according to the address modifier, inc */
516 pOut += inc;
518 /* Increment the pointer pIn1 index, count by 1 */
519 count++;
521 /* Update the inputA and inputB pointers for next MAC calculation */
522 px = pIn1 + count;
523 py = pIn2;
525 /* Decrement the loop counter */
526 blkCnt--;
529 else
531 /* If the srcBLen is not a multiple of 4,
532 * the blockSize2 loop cannot be unrolled by 4 */
533 blkCnt = blockSize2;
535 while(blkCnt > 0u)
537 /* Accumulator is made zero for every iteration */
538 sum = 0.0f;
540 /* Loop over srcBLen */
541 k = srcBLen;
543 while(k > 0u)
545 /* Perform the multiply-accumulate */
546 sum += *px++ * *py++;
548 /* Decrement the loop counter */
549 k--;
552 /* Store the result in the accumulator in the destination buffer. */
553 *pOut = sum;
554 /* Destination pointer is updated according to the address modifier, inc */
555 pOut += inc;
557 /* Increment the pointer pIn1 index, count by 1 */
558 count++;
560 /* Update the inputA and inputB pointers for next MAC calculation */
561 px = pIn1 + count;
562 py = pIn2;
564 /* Decrement the loop counter */
565 blkCnt--;
569 /* --------------------------
570 * Initializations of stage3
571 * -------------------------*/
573 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
574 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
575 * ....
576 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
577 * sum += x[srcALen-1] * y[0]
580 /* In this stage the MAC operations are decreased by 1 for every iteration.
581 The count variable holds the number of MAC operations performed */
582 count = srcBLen - 1u;
584 /* Working pointer of inputA */
585 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
586 px = pSrc1;
588 /* Working pointer of inputB */
589 py = pIn2;
591 /* -------------------
592 * Stage3 process
593 * ------------------*/
595 while(blockSize3 > 0u)
597 /* Accumulator is made zero for every iteration */
598 sum = 0.0f;
600 /* Apply loop unrolling and compute 4 MACs simultaneously. */
601 k = count >> 2u;
603 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
604 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
605 while(k > 0u)
607 /* Perform the multiply-accumulates */
608 /* sum += x[srcALen - srcBLen + 4] * y[3] */
609 sum += *px++ * *py++;
610 /* sum += x[srcALen - srcBLen + 3] * y[2] */
611 sum += *px++ * *py++;
612 /* sum += x[srcALen - srcBLen + 2] * y[1] */
613 sum += *px++ * *py++;
614 /* sum += x[srcALen - srcBLen + 1] * y[0] */
615 sum += *px++ * *py++;
617 /* Decrement the loop counter */
618 k--;
621 /* If the count is not a multiple of 4, compute any remaining MACs here.
622 ** No loop unrolling is used. */
623 k = count % 0x4u;
625 while(k > 0u)
627 /* Perform the multiply-accumulates */
628 sum += *px++ * *py++;
630 /* Decrement the loop counter */
631 k--;
634 /* Store the result in the accumulator in the destination buffer. */
635 *pOut = sum;
636 /* Destination pointer is updated according to the address modifier, inc */
637 pOut += inc;
639 /* Update the inputA and inputB pointers for next MAC calculation */
640 px = ++pSrc1;
641 py = pIn2;
643 /* Decrement the MAC count */
644 count--;
646 /* Decrement the loop counter */
647 blockSize3--;
650 #else
652 /* Run the below code for Cortex-M0 */
654 float32_t *pIn1 = pSrcA; /* inputA pointer */
655 float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
656 float32_t sum; /* Accumulator */
657 uint32_t i = 0u, j; /* loop counters */
658 uint32_t inv = 0u; /* Reverse order flag */
659 uint32_t tot = 0u; /* Length */
661 /* The algorithm implementation is based on the lengths of the inputs. */
662 /* srcB is always made to slide across srcA. */
663 /* So srcBLen is always considered as shorter or equal to srcALen */
664 /* But CORR(x, y) is reverse of CORR(y, x) */
665 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
666 /* and a varaible, inv is set to 1 */
667 /* If lengths are not equal then zero pad has to be done to make the two
668 * inputs of same length. But to improve the performance, we assume zeroes
669 * in the output instead of zero padding either of the the inputs*/
670 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
671 * starting of the output buffer */
672 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
673 * ending of the output buffer */
674 /* Once the zero padding is done the remaining of the output is calcualted
675 * using convolution but with the shorter signal time shifted. */
677 /* Calculate the length of the remaining sequence */
678 tot = ((srcALen + srcBLen) - 2u);
680 if(srcALen > srcBLen)
682 /* Calculating the number of zeros to be padded to the output */
683 j = srcALen - srcBLen;
685 /* Initialise the pointer after zero padding */
686 pDst += j;
689 else if(srcALen < srcBLen)
691 /* Initialization to inputB pointer */
692 pIn1 = pSrcB;
694 /* Initialization to the end of inputA pointer */
695 pIn2 = pSrcA + (srcALen - 1u);
697 /* Initialisation of the pointer after zero padding */
698 pDst = pDst + tot;
700 /* Swapping the lengths */
701 j = srcALen;
702 srcALen = srcBLen;
703 srcBLen = j;
705 /* Setting the reverse flag */
706 inv = 1;
710 /* Loop to calculate convolution for output length number of times */
711 for (i = 0u; i <= tot; i++)
713 /* Initialize sum with zero to carry on MAC operations */
714 sum = 0.0f;
716 /* Loop to perform MAC operations according to convolution equation */
717 for (j = 0u; j <= i; j++)
719 /* Check the array limitations */
720 if((((i - j) < srcBLen) && (j < srcALen)))
722 /* z[i] += x[i-j] * y[j] */
723 sum += pIn1[j] * pIn2[-((int32_t) i - j)];
726 /* Store the output in the destination buffer */
727 if(inv == 1)
728 *pDst-- = sum;
729 else
730 *pDst++ = sum;
733 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
737 /**
738 * @} end of Corr group