lib/main/STM32F3/Drivers/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        19. March 2015
   5 * $Revision:    V.1.4.5
   6 *
   7 * Project:      CMSIS DSP Library
   8 * Title:            arm_cmplx_mat_mult_q15.c
   9 *
  10 * Description:   Q15 complex matrix multiplication.
  11 *
  12 * Target Processor:          Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40 #include "arm_math.h"
  41
  42 /**
  43  * @ingroup groupMatrix
  44  */
  45
  46 /**
  47  * @addtogroup CmplxMatrixMult
  48  * @{
  49  */
  50
  51
  52 /**
  53  * @brief Q15 Complex matrix multiplication
  54  * @param[in]       *pSrcA points to the first input complex matrix structure
  55  * @param[in]       *pSrcB points to the second input complex matrix structure
  56  * @param[out]      *pDst points to output complex matrix structure
  57  * @param[in]           *pScratch points to the array for storing intermediate results
  58  * @return              The function returns either
  59  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  60  *
  61  * \par Conditions for optimum performance
  62  *  Input, output and state buffers should be aligned by 32-bit
  63  *
  64  * \par Restrictions
  65  *  If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
  66  *      In this case input, output, scratch buffers should be aligned by 32-bit
  67  *
  68  * @details
  69  * <b>Scaling and Overflow Behavior:</b>
  70  *
  71  * \par
  72  * The function is implemented using a 64-bit internal accumulator. The inputs to the
  73  * multiplications are in 1.15 format and multiplications yield a 2.30 result.
  74  * The 2.30 intermediate
  75  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
  76  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
  77  * truncated to 34.15 format by discarding the low 15 bits and then saturated to
  78  * 1.15 format.
  79  *
  80  * \par
  81  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
  82  *
  83  */
  84
  85
  86
  87
  88 arm_status arm_mat_cmplx_mult_q15(
  89   const arm_matrix_instance_q15 * pSrcA,
  90   const arm_matrix_instance_q15 * pSrcB,
  91   arm_matrix_instance_q15 * pDst,
  92   q15_t * pScratch)
  93 {
  94   /* accumulator */
  95   q15_t *pSrcBT = pScratch;                      /* input data matrix pointer for transpose */
  96   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  97   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  98   q15_t *px;                                     /* Temporary output data matrix pointer */
  99   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
 100   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
 101   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
 102   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
 103   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
 104   arm_status status;                             /* status of matrix multiplication */
 105   q63_t sumReal, sumImag;
 106
 107 #ifdef UNALIGNED_SUPPORT_DISABLE
 108   q15_t in;                                      /* Temporary variable to hold the input value */
 109   q15_t a, b, c, d;
 110 #else
 111   q31_t in;                                      /* Temporary variable to hold the input value */
 112   q31_t prod1, prod2;
 113   q31_t pSourceA, pSourceB;
 114 #endif
 115
 116 #ifdef ARM_MATH_MATRIX_CHECK
 117   /* Check for matrix mismatch condition */
 118   if((pSrcA->numCols != pSrcB->numRows) ||
 119      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 120   {
 121     /* Set status as ARM_MATH_SIZE_MISMATCH */
 122     status = ARM_MATH_SIZE_MISMATCH;
 123   }
 124   else
 125 #endif
 126   {
 127     /* Matrix transpose */
 128     do
 129     {
 130       /* Apply loop unrolling and exchange the columns with row elements */
 131       col = numColsB >> 2;
 132
 133       /* The pointer px is set to starting address of the column being processed */
 134       px = pSrcBT + i;
 135
 136       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 137        ** a second loop below computes the remaining 1 to 3 samples. */
 138       while(col > 0u)
 139       {
 140 #ifdef UNALIGNED_SUPPORT_DISABLE
 141         /* Read two elements from the row */
 142         in = *pInB++;
 143         *px = in;
 144         in = *pInB++;
 145         px[1] = in;
 146
 147         /* Update the pointer px to point to the next row of the transposed matrix */
 148         px += numRowsB * 2;
 149
 150         /* Read two elements from the row */
 151         in = *pInB++;
 152         *px = in;
 153         in = *pInB++;
 154         px[1] = in;
 155
 156         /* Update the pointer px to point to the next row of the transposed matrix */
 157         px += numRowsB * 2;
 158
 159         /* Read two elements from the row */
 160         in = *pInB++;
 161         *px = in;
 162         in = *pInB++;
 163         px[1] = in;
 164
 165         /* Update the pointer px to point to the next row of the transposed matrix */
 166         px += numRowsB * 2;
 167
 168         /* Read two elements from the row */
 169         in = *pInB++;
 170         *px = in;
 171         in = *pInB++;
 172         px[1] = in;
 173
 174         /* Update the pointer px to point to the next row of the transposed matrix */
 175         px += numRowsB * 2;
 176
 177         /* Decrement the column loop counter */
 178         col--;
 179       }
 180
 181       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
 182        ** No loop unrolling is used. */
 183       col = numColsB % 0x4u;
 184
 185       while(col > 0u)
 186       {
 187         /* Read two elements from the row */
 188         in = *pInB++;
 189         *px = in;
 190         in = *pInB++;
 191         px[1] = in;
 192 #else
 193
 194         /* Read two elements from the row */
 195         in = *__SIMD32(pInB)++;
 196
 197         *__SIMD32(px) = in;
 198
 199         /* Update the pointer px to point to the next row of the transposed matrix */
 200         px += numRowsB * 2;
 201
 202
 203         /* Read two elements from the row */
 204         in = *__SIMD32(pInB)++;
 205
 206         *__SIMD32(px) = in;
 207
 208         /* Update the pointer px to point to the next row of the transposed matrix */
 209         px += numRowsB * 2;
 210
 211         /* Read two elements from the row */
 212         in = *__SIMD32(pInB)++;
 213
 214         *__SIMD32(px) = in;
 215
 216         /* Update the pointer px to point to the next row of the transposed matrix */
 217         px += numRowsB * 2;
 218
 219         /* Read two elements from the row */
 220         in = *__SIMD32(pInB)++;
 221
 222         *__SIMD32(px) = in;
 223
 224         /* Update the pointer px to point to the next row of the transposed matrix */
 225         px += numRowsB * 2;
 226
 227         /* Decrement the column loop counter */
 228         col--;
 229       }
 230
 231       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
 232        ** No loop unrolling is used. */
 233       col = numColsB % 0x4u;
 234
 235       while(col > 0u)
 236       {
 237         /* Read two elements from the row */
 238         in = *__SIMD32(pInB)++;
 239
 240         *__SIMD32(px) = in;
 241 #endif
 242
 243         /* Update the pointer px to point to the next row of the transposed matrix */
 244         px += numRowsB * 2;
 245
 246         /* Decrement the column loop counter */
 247         col--;
 248       }
 249
 250       i = i + 2u;
 251
 252       /* Decrement the row loop counter */
 253       row--;
 254
 255     } while(row > 0u);
 256
 257     /* Reset the variables for the usage in the following multiplication process */
 258     row = numRowsA;
 259     i = 0u;
 260     px = pDst->pData;
 261
 262     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 263     /* row loop */
 264     do
 265     {
 266       /* For every row wise process, the column loop counter is to be initiated */
 267       col = numColsB;
 268
 269       /* For every row wise process, the pIn2 pointer is set
 270        ** to the starting address of the transposed pSrcB data */
 271       pInB = pSrcBT;
 272
 273       /* column loop */
 274       do
 275       {
 276         /* Set the variable sum, that acts as accumulator, to zero */
 277         sumReal = 0;
 278         sumImag = 0;
 279
 280         /* Apply loop unrolling and compute 2 MACs simultaneously. */
 281         colCnt = numColsA >> 1;
 282
 283         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
 284         pInA = pSrcA->pData + i * 2;
 285
 286
 287         /* matrix multiplication */
 288         while(colCnt > 0u)
 289         {
 290           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 291
 292 #ifdef UNALIGNED_SUPPORT_DISABLE
 293
 294           /* read real and imag values from pSrcA buffer */
 295           a = *pInA;
 296           b = *(pInA + 1u);
 297           /* read real and imag values from pSrcB buffer */
 298           c = *pInB;
 299           d = *(pInB + 1u);
 300
 301           /* Multiply and Accumlates */
 302           sumReal += (q31_t) a *c;
 303           sumImag += (q31_t) a *d;
 304           sumReal -= (q31_t) b *d;
 305           sumImag += (q31_t) b *c;
 306
 307           /* read next real and imag values from pSrcA buffer */
 308           a = *(pInA + 2u);
 309           b = *(pInA + 3u);
 310           /* read next real and imag values from pSrcB buffer */
 311           c = *(pInB + 2u);
 312           d = *(pInB + 3u);
 313
 314           /* update pointer */
 315           pInA += 4u;
 316
 317           /* Multiply and Accumlates */
 318           sumReal += (q31_t) a *c;
 319           sumImag += (q31_t) a *d;
 320           sumReal -= (q31_t) b *d;
 321           sumImag += (q31_t) b *c;
 322           /* update pointer */
 323           pInB += 4u;
 324 #else
 325           /* read real and imag values from pSrcA and pSrcB buffer */
 326           pSourceA = *__SIMD32(pInA)++;
 327           pSourceB = *__SIMD32(pInB)++;
 328
 329           /* Multiply and Accumlates */
 330 #ifdef ARM_MATH_BIG_ENDIAN
 331           prod1 = -__SMUSD(pSourceA, pSourceB);
 332 #else
 333           prod1 = __SMUSD(pSourceA, pSourceB);
 334 #endif
 335           prod2 = __SMUADX(pSourceA, pSourceB);
 336           sumReal += (q63_t) prod1;
 337           sumImag += (q63_t) prod2;
 338
 339           /* read real and imag values from pSrcA and pSrcB buffer */
 340           pSourceA = *__SIMD32(pInA)++;
 341           pSourceB = *__SIMD32(pInB)++;
 342
 343           /* Multiply and Accumlates */
 344 #ifdef ARM_MATH_BIG_ENDIAN
 345           prod1 = -__SMUSD(pSourceA, pSourceB);
 346 #else
 347           prod1 = __SMUSD(pSourceA, pSourceB);
 348 #endif
 349           prod2 = __SMUADX(pSourceA, pSourceB);
 350           sumReal += (q63_t) prod1;
 351           sumImag += (q63_t) prod2;
 352
 353 #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
 354
 355           /* Decrement the loop counter */
 356           colCnt--;
 357         }
 358
 359         /* process odd column samples */
 360         if((numColsA & 0x1u) > 0u)
 361         {
 362           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 363
 364 #ifdef UNALIGNED_SUPPORT_DISABLE
 365
 366           /* read real and imag values from pSrcA and pSrcB buffer */
 367           a = *pInA++;
 368           b = *pInA++;
 369           c = *pInB++;
 370           d = *pInB++;
 371
 372           /* Multiply and Accumlates */
 373           sumReal += (q31_t) a *c;
 374           sumImag += (q31_t) a *d;
 375           sumReal -= (q31_t) b *d;
 376           sumImag += (q31_t) b *c;
 377
 378 #else
 379           /* read real and imag values from pSrcA and pSrcB buffer */
 380           pSourceA = *__SIMD32(pInA)++;
 381           pSourceB = *__SIMD32(pInB)++;
 382
 383           /* Multiply and Accumlates */
 384 #ifdef ARM_MATH_BIG_ENDIAN
 385           prod1 = -__SMUSD(pSourceA, pSourceB);
 386 #else
 387           prod1 = __SMUSD(pSourceA, pSourceB);
 388 #endif
 389           prod2 = __SMUADX(pSourceA, pSourceB);
 390           sumReal += (q63_t) prod1;
 391           sumImag += (q63_t) prod2;
 392
 393 #endif /*      #ifdef UNALIGNED_SUPPORT_DISABLE */
 394
 395         }
 396
 397         /* Saturate and store the result in the destination buffer */
 398
 399         *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
 400         *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
 401
 402         /* Decrement the column loop counter */
 403         col--;
 404
 405       } while(col > 0u);
 406
 407       i = i + numColsA;
 408
 409       /* Decrement the row loop counter */
 410       row--;
 411
 412     } while(row > 0u);
 413
 414     /* set status as ARM_MATH_SUCCESS */
 415     status = ARM_MATH_SUCCESS;
 416   }
 417
 418   /* Return to application */
 419   return (status);
 420 }
 421
 422 /**
 423  * @} end of MatrixMult group
 424  */