/*
 * $Date: 2012-07-05 16:45:20 +0200 (Do, 05. Jul 2012) $
 ***************************************************************/

/** \file
 * \brief Implementation of class FMEMultipoleKernel.
 *
 * \author Martin Gronemann
 *
 * \par License:
 * This file is part of the Open Graph Drawing Framework (OGDF).
 * See README.txt in the root directory of the OGDF installation for details.
 *
 * \par
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * Version 2 or 3 as published by the Free Software Foundation;
 * see the file LICENSE.txt included in the packaging of this file
 * for details.
 *
 * \par
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * \par
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * \see http://www.gnu.org/copyleft/gpl.html
 ***************************************************************/
43 #include "FMEMultipoleKernel.h"
44 #include "ArrayGraph.h"
45 #include "LinearQuadtree.h"
46 #include "LinearQuadtreeBuilder.h"
47 #include "LinearQuadtreeExpansion.h"
void FMEMultipoleKernel::quadtreeConstruction(ArrayPartition& pointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;

    // precompute the bounding box for the quadtree points from the graph nodes
    for_loop(pointPartition, min_max_x_function(localContext));
    for_loop(pointPartition, min_max_y_function(localContext));
    // wait until the thread's bounding box is computed
    sync();

    // let the main thread compute the bounding box of the bounding boxes
    if (isMainThread())
    {
        globalContext->min_x = globalContext->pLocalContext[0]->min_x;
        globalContext->min_y = globalContext->pLocalContext[0]->min_y;
        globalContext->max_x = globalContext->pLocalContext[0]->max_x;
        globalContext->max_y = globalContext->pLocalContext[0]->max_y;
        for (__uint32 j = 1; j < numThreads(); j++)
        {
            globalContext->min_x = min(globalContext->min_x, globalContext->pLocalContext[j]->min_x);
            globalContext->min_y = min(globalContext->min_y, globalContext->pLocalContext[j]->min_y);
            globalContext->max_x = max(globalContext->max_x, globalContext->pLocalContext[j]->max_x);
            globalContext->max_y = max(globalContext->max_y, globalContext->pLocalContext[j]->max_y);
        }
        tree.init(globalContext->min_x, globalContext->min_y, globalContext->max_x, globalContext->max_y);
        globalContext->coolDown *= 0.999f;
    }
    // wait because the Morton number computation needs the bounding box
    sync();
    // update the Morton numbers to prepare the points for sorting
    for_loop(pointPartition, LQMortonFunctor(localContext));
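    /* Background (a sketch, not OGDF code): LQMortonFunctor assigns each point
     * a Morton (z-order) key, so that sorting by key below groups the points
     * by quadtree cell. A minimal sketch of the bit interleaving for two
     * 32-bit grid coordinates, assuming the usual convention of x on the even
     * bits (the real functor additionally maps point coordinates to the grid):
     *
     *   unsigned long long mortonKey2D(unsigned int x, unsigned int y)
     *   {
     *       unsigned long long key = 0;
     *       for (unsigned int b = 0; b < 32; b++)
     *       {
     *           key |= (unsigned long long)((x >> b) & 1) << (2*b);     // even bits from x
     *           key |= (unsigned long long)((y >> b) & 1) << (2*b + 1); // odd bits from y
     *       }
     *       return key;
     *   }
     */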
    // wait so we can sort the points by Morton number
    sync();

#ifdef OGDF_FME_PARALLEL_QUADTREE_SORT
    // use a simple parallel sorting algorithm
    LinearQuadtree::LQPoint* points = tree.pointArray();
    sort_parallel(points, tree.numberOfPoints(), LQPointComparer);
#else
    // let the main thread sort the points
    if (isMainThread())
    {
        LinearQuadtree::LQPoint* points = tree.pointArray();
        sort_single(points, tree.numberOfPoints(), LQPointComparer);
    }
#endif
    // wait because the quadtree builder needs the sorted order
    sync();
    // if this is not a parallel run, we can do it the easy way
    if (isSingleThreaded())
    {
        LinearQuadtreeBuilder builder(tree);
        // prepare and link the tree
        builder.prepareTree();
        builder.build();
        LQPartitioner partitioner(localContext);
        partitioner.partition();
    } else // the more difficult part
    {
        // snap the left point of this thread's interval to the first point in the cell
        LinearQuadtree::PointID beginPoint = tree.findFirstPointInCell(pointPartition.begin);
        LinearQuadtree::PointID endPoint_plus_one;
        // if this thread is the last one, no snapping is required for the right point
        if (threadNr() == numThreads()-1)
            endPoint_plus_one = tree.numberOfPoints();
        else // find the left point of the next thread
            endPoint_plus_one = tree.findFirstPointInCell(pointPartition.end+1);
        // now we can prepare the snapped interval
        LinearQuadtreeBuilder builder(tree);
        // this function prepares the tree from beginPoint to endPoint_plus_one-1 (EXCLUDING endPoint_plus_one)
        builder.prepareTree(beginPoint, endPoint_plus_one);
        // save the start, end and count of the inner node chain in the context
        localContext->firstInnerNode = builder.firstInner;
        localContext->lastInnerNode  = builder.lastInner;
        localContext->numInnerNodes  = builder.numInnerNodes;
        // save the start, end and count of the leaf node chain in the context
        localContext->firstLeaf = builder.firstLeaf;
        localContext->lastLeaf  = builder.lastLeaf;
        localContext->numLeaves = builder.numLeaves;
        // wait until all threads are finished
        sync();

        // now the main thread has to link the tree
        if (isMainThread())
        {
            // with its own builder
            LinearQuadtreeBuilder sbuilder(tree);
            // first we need the complete chain data
            sbuilder.firstInner    = globalContext->pLocalContext[0]->firstInnerNode;
            sbuilder.firstLeaf     = globalContext->pLocalContext[0]->firstLeaf;
            sbuilder.numInnerNodes = globalContext->pLocalContext[0]->numInnerNodes;
            sbuilder.numLeaves     = globalContext->pLocalContext[0]->numLeaves;
            for (__uint32 j = 1; j < numThreads(); j++)
            {
                sbuilder.numLeaves     += globalContext->pLocalContext[j]->numLeaves;
                sbuilder.numInnerNodes += globalContext->pLocalContext[j]->numInnerNodes;
            }
            sbuilder.lastInner = globalContext->pLocalContext[numThreads()-1]->lastInnerNode;
            sbuilder.lastLeaf  = globalContext->pLocalContext[numThreads()-1]->lastLeaf;
            sbuilder.buildHierarchy();
        }
        // and run the partitions
        LQPartitioner partitioner(localContext);
        partitioner.partition();
    }
    // wait for the tree to finish
    sync();
    // now update the copy of the point data
    for_loop(pointPartition, LQPointUpdateFunctor(localContext));
    // compute the nodes' coordinates and sizes
    tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
    tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
}
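/* Overview (summary comment, not OGDF documentation): the multipoleApprox*
 * variants below all implement the same FMM pass pipeline on the quadtree
 * built above; they differ only in how the work is distributed across the
 * threads:
 *
 *   P2M: leaves aggregate their points into multipole coefficients,
 *   M2M: child coefficients are shifted bottom-up to the parents,
 *   M2L: multipole coefficients of well-separated pairs are converted
 *        into local coefficients (the approximation step),
 *   L2L: local coefficients are pushed top-down to the children,
 *   L2P: leaves evaluate their local expansion at each point,
 *   P2P: pairs that are not well separated are evaluated directly.
 */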
//! the simple single threaded multipole approximation
void FMEMultipoleKernel::multipoleApproxSingleThreaded(ArrayPartition& nodePointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;
    if (isMainThread())
    {
        tree.bottom_up_traversal(                   // do a bottom up traversal, M2M pass
            if_then_else(tree.is_leaf_condition(),  // if the current node is a leaf
                p2m_function(localContext),         // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)          // else shift the coefficients of all children to the center of the inner node
            )
        )(tree.root()); // start at the root

        tree.forall_well_separated_pairs(                   // do a WSPD traversal, M2L and direct evaluation
            pair_vice_versa(m2l_function(localContext)),    // M2L for a well-separated pair
            p2p_function(localContext),                     // direct evaluation
            p2p_function(localContext)                      // direct evaluation
        )(tree.root()); // start at the root

        tree.top_down_traversal(                    // top down traversal, L2L pass
            if_then_else(tree.is_leaf_condition(),  // if the node is a leaf
                do_nothing(),                       // then do nothing, we will deal with this case later
                l2l_function(localContext)          // else shift the node's local coeffs to the children
            )
        )(tree.root()); // start at the root
        // evaluate all leaves and store the forces in the thread's array
        for_loop(nodePointPartition,                 // loop over points
            func_comp(                               // composition of two statements
                l2p_function(localContext),          // evaluate the forces due to the local expansion in the corresponding leaf
                collect_force_function               // collect the forces of all threads with the following options:
                <
                    COLLECT_REPULSIVE_FACTOR |       // multiply by the repulsive factor stored in the global options
                    COLLECT_TREE_2_GRAPH_ORDER |     // thread data is stored in quadtree leaf order, transform it into graph order
                    COLLECT_ZERO_THREAD_ARRAY        // reset the thread array
                >(localContext)
            )
        );
    }
}
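/* A rough sketch (hypothetical helper names, not the real functor) of what
 * collect_force_function above does for each point i of the partition, with
 * the behavior toggled by the COLLECT_* flags:
 *
 *   float sumX = 0, sumY = 0;
 *   for (__uint32 t = 0; t < numThreads(); t++)  // sum over all thread arrays
 *   {
 *       sumX += forceXOfThread(t)[i];
 *       sumY += forceYOfThread(t)[i];
 *       // COLLECT_ZERO_THREAD_ARRAY: reset the entries after reading
 *   }
 *   // COLLECT_REPULSIVE_FACTOR: scale the sum by the repulsive factor
 *   // COLLECT_TREE_2_GRAPH_ORDER: write to the graph index of point i
 *   globalForceX[graphIndexOf(i)] += factor * sumX;
 *   globalForceY[graphIndexOf(i)] += factor * sumY;
 */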
//! the original algorithm which runs the WSPD completely single threaded
void FMEMultipoleKernel::multipoleApproxSingleWSPD(ArrayPartition& nodePointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;

    // let the main thread run the WSPD
    if (isMainThread())
    {
        // the well-separated pairs decomposition
        tree.forall_well_separated_pairs(       // do a WSPD traversal
            tree.StoreWSPairFunction(),         // store the ws pairs in the WSPD
            tree.StoreDirectPairFunction(),     // store the direct pairs
            tree.StoreDirectNodeFunction()      // store the direct nodes
        )(tree.root()); // start at the root
    }
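    /* Background (a conceptual sketch, not the actual test in LinearQuadtree):
     * two tree nodes u, v form a well-separated pair when their boxes are far
     * apart relative to their size, conceptually something like
     *
     *   dist(center(u), center(v)) > s * (radius(u) + radius(v))
     *
     * for a separation factor s. Such pairs are approximated with a single
     * M2L conversion; pairs that fail the test are split further or, at the
     * leaves, evaluated directly (P2P).
     */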
    // Note: don't wait for the WSPD to finish. We don't need it yet for
    // the big multithreaded bottom up traversal.
    for_tree_partition(                             // for all roots in the thread's tree partition
        tree.bottom_up_traversal(                   // do a bottom up traversal
            if_then_else(tree.is_leaf_condition(),  // if the current node is a leaf
                p2m_function(localContext),         // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)          // else shift the coefficients of all children to the center of the inner node
            )
        )
    );
    sync();
    // top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.bottom_up_traversal(                       // start a bottom up traversal
            if_then_else(tree.is_leaf_condition(),      // if the current node is a leaf
                p2m_function(localContext),             // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)              // else shift the coefficients of all children to the center of the inner node
            ),
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root
    }
    // wait until all nodes in the quadtree have their multipole coeffs
    sync();
    // M2L pass with the WSPD
    tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
    tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
    // D2D pass, store the forces in the thread force array
    for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
    // ND pass, store the forces in the thread force array
    for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));
    // wait until all local coeffs and all direct forces are computed
    sync();
    // big multithreaded top down traversal. the top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.top_down_traversal(                        // top down traversal
            if_then_else(tree.is_leaf_condition(),      // if the node is a leaf
                do_nothing(),                           // then do nothing, we will deal with the L2P pass later
                l2l_function(localContext)              // else shift the node's local coeffs to the children
            ),
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root
    }
    sync();
    // the rest is done by the threads
    for_tree_partition(                             // for all roots in the thread's tree partition
        tree.top_down_traversal(                    // do a top down traversal
            if_then_else(tree.is_leaf_condition(),  // if the node is a leaf
                do_nothing(),                       // then do nothing, we will deal with the L2P pass later
                l2l_function(localContext)          // else shift the node's local coeffs to the children
            )
        )
    );
    // wait until the traversal is finished and all leaves have their accumulated local coeffs
    sync();
    // evaluate all leaves and store the forces in the thread array
    // (Note: we could store them in the global array, but then we would have to use random access)
    // we can start collecting the forces immediately because we evaluate point by point
    for_loop(nodePointPartition,                 // loop over the thread's points
        func_comp(                               // composition of two statements
            l2p_function(localContext),          // evaluate the forces due to the local expansion in the corresponding leaf
            collect_force_function               // collect the forces of all threads with the following options:
            <
                COLLECT_REPULSIVE_FACTOR |       // multiply by the repulsive factor stored in the global options
                COLLECT_TREE_2_GRAPH_ORDER |     // thread data is stored in quadtree leaf order, transform it into graph order
                COLLECT_ZERO_THREAD_ARRAY        // reset the thread array to zero
            >(localContext)
        )
    );
}
//! a newer but slower method: computes the WSPD in parallel without using the WSPD structure
void FMEMultipoleKernel::multipoleApproxNoWSPDStructure(ArrayPartition& nodePointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;
    // big multithreaded bottom up traversal
    for_tree_partition(                             // for all roots in the thread's tree partition
        tree.bottom_up_traversal(                   // do a bottom up traversal
            if_then_else(tree.is_leaf_condition(),  // if the current node is a leaf
                p2m_function(localContext),         // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)          // else shift the coefficients of all children to the center of the inner node
            )
        )
    );
    sync();
    // top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.bottom_up_traversal(                       // start a bottom up traversal
            if_then_else(tree.is_leaf_condition(),      // if the current node is a leaf
                p2m_function(localContext),             // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)              // else shift the coefficients of all children to the center of the inner node
            ),
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root

        tree.forall_well_separated_pairs(                   // do a WSPD traversal
            pair_vice_versa(m2l_function(localContext)),    // M2L for a well-separated pair
            p2p_function(localContext),                     // direct evaluation
            p2p_function(localContext),                     // direct evaluation
            not_condition(tree.is_fence_condition())        // stop when the fence to the threads is reached
        )(tree.root()); // start at the root
    }
    // wait until all local coeffs and all direct forces are computed
    sync();

    // now a WSPD traversal for the roots in the tree partition
    for_tree_partition(
        tree.forall_well_separated_pairs(                   // do a WSPD traversal
            pair_vice_versa(m2l_function(localContext)),    // M2L for a well-separated pair
            p2p_function(localContext),                     // direct evaluation
            p2p_function(localContext)                      // direct evaluation
        )
    );
    // wait until all local coeffs and all direct forces are computed
    sync();

    // big multithreaded top down traversal. the top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.top_down_traversal(                        // top down traversal
            if_then_else(tree.is_leaf_condition(),      // if the node is a leaf
                do_nothing(),                           // then do nothing, we will deal with this case later
                l2l_function(localContext)              // else shift the node's local coeffs to the children
            ),
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root
    }
    // wait until the traversal is finished and all roots of the threads have their coefficients
    sync();

    for_tree_partition(                             // for all roots in the thread's tree partition
        tree.top_down_traversal(                    // do a top down traversal
            if_then_else(tree.is_leaf_condition(),  // if the node is a leaf
                do_nothing(),                       // then do nothing, we will deal with this case later
                l2l_function(localContext)          // else shift the node's local coeffs to the children
            )
        )
    );
    // wait until the traversal is finished and all leaves have their accumulated local coeffs
    sync();
    // evaluate all leaves and store the forces in the thread array
    // (Note: we could store them in the global array, but then we would have to use random access)
    // we can start collecting the forces immediately because we evaluate point by point
    for_loop(nodePointPartition,                 // loop over the thread's points
        func_comp(                               // composition of two statements
            l2p_function(localContext),          // evaluate the forces due to the local expansion in the corresponding leaf
            collect_force_function               // collect the forces of all threads with the following options:
            <
                COLLECT_REPULSIVE_FACTOR |       // multiply by the repulsive factor stored in the global options
                COLLECT_TREE_2_GRAPH_ORDER |     // thread data is stored in quadtree leaf order, transform it into graph order
                COLLECT_ZERO_THREAD_ARRAY        // reset the thread array
            >(localContext)
        )
    );
}
//! the final approximation algorithm which runs the WSPD in parallel without storing it in the threads' subtrees
void FMEMultipoleKernel::multipoleApproxFinal(ArrayPartition& nodePointPartition)
{
    FMELocalContext*  localContext  = m_pLocalContext;
    FMEGlobalContext* globalContext = m_pGlobalContext;
    LinearQuadtree&   tree          = *globalContext->pQuadtree;
    // big multithreaded bottom up traversal
    for_tree_partition(                             // for all roots in the thread's tree partition
        tree.bottom_up_traversal(                   // do a bottom up traversal
            if_then_else(tree.is_leaf_condition(),  // if the current node is a leaf
                p2m_function(localContext),         // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)          // else shift the coefficients of all children to the center of the inner node
            )
        )
    );
    sync();
    // top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.bottom_up_traversal(                       // start a bottom up traversal
            if_then_else(tree.is_leaf_condition(),      // if the current node is a leaf
                p2m_function(localContext),             // then calculate the multipole coeffs due to the points in the leaf
                m2m_function(localContext)              // else shift the coefficients of all children to the center of the inner node
            ),
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root

        tree.forall_well_separated_pairs(               // do a WSPD traversal
            tree.StoreWSPairFunction(),                 // store the ws pairs in the WSPD
            tree.StoreDirectPairFunction(),             // store the direct pairs
            tree.StoreDirectNodeFunction(),             // store the direct nodes
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root
    }
    // wait for the main thread to finish
    sync();

    // M2L pass with the WSPD for the result of the single threaded pass above
    tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
    tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();

    // D2D and ND passes, store the forces in the thread force array
    for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
    for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));

    // wait until all local coeffs and all direct forces are computed
    sync();

    // the rest of the WSPD can be done on the fly by the threads
    for_tree_partition(
        tree.forall_well_separated_pairs(                   // do a WSPD traversal
            pair_vice_versa(m2l_function(localContext)),    // M2L for a well-separated pair
            p2p_function(localContext),                     // direct evaluation
            p2p_function(localContext)                      // direct evaluation
        )
    );
    // wait until all local coeffs and all direct forces are computed
    sync();
    // big multithreaded top down traversal. the top of the tree has to be done by the main thread
    if (isMainThread())
    {
        tree.top_down_traversal(                        // top down traversal, L2L pass
            if_then_else(tree.is_leaf_condition(),      // if the node is a leaf
                do_nothing(),                           // then do nothing, we will deal with this case later
                l2l_function(localContext)              // else shift the node's local coeffs to the children
            ),
            not_condition(tree.is_fence_condition())    // stop when the fence to the threads is reached
        )(tree.root()); // start at the root
    }
    // wait for the top of the tree
    sync();

    for_tree_partition(                             // for all roots in the thread's tree partition, L2L pass
        tree.top_down_traversal(                    // do a top down traversal
            if_then_else(tree.is_leaf_condition(),  // if the node is a leaf
                do_nothing(),                       // then do nothing, we will deal with this case later
                l2l_function(localContext)          // else shift the node's local coeffs to the children
            )
        )
    );
    // wait until the traversal is finished and all leaves have their accumulated local coeffs
    sync();
    // evaluate all leaves and store the forces in the thread array
    // (Note: we could store them in the global array, but then we would have to use random access)
    // we can start collecting the forces immediately because we evaluate point by point
    for_loop(nodePointPartition,                 // loop over the thread's points
        func_comp(                               // composition of two statements
            l2p_function(localContext),          // evaluate the forces due to the local expansion in the corresponding leaf
            collect_force_function               // collect the forces of all threads with the following options:
            <
                COLLECT_REPULSIVE_FACTOR |       // multiply by the repulsive factor stored in the global options
                COLLECT_TREE_2_GRAPH_ORDER |     // thread data is stored in quadtree leaf order, transform it into graph order
                COLLECT_ZERO_THREAD_ARRAY        // reset the thread array
            >(localContext)
        )
    );
}
void FMEMultipoleKernel::operator()(FMEGlobalContext* globalContext)
{
    __uint32 maxNumIterations = globalContext->pOptions->maxNumIterations;
    __uint32 minNumIterations = globalContext->pOptions->minNumIterations;
    ArrayGraph&              graph   = *globalContext->pGraph;
    LinearQuadtree&          tree    = *globalContext->pQuadtree;
    LinearQuadtreeExpansion& treeExp = *globalContext->pExpansion;
    FMELocalContext*  localContext = globalContext->pLocalContext[threadNr()];
    FMEGlobalOptions* options      = globalContext->pOptions;
    float* threadsForceArrayX = localContext->forceX;
    float* threadsForceArrayY = localContext->forceY;
    float* globalForceArrayX  = globalContext->globalForceX;
    float* globalForceArrayY  = globalContext->globalForceY;

    ArrayPartition edgePartition      = arrayPartition(graph.numEdges());
    ArrayPartition nodePointPartition = arrayPartition(graph.numNodes());

    m_pLocalContext  = localContext;
    m_pGlobalContext = globalContext;

    /****************************/
    /* initialization           */
    /****************************/
    // reset the global force array
    for_loop_array_set(threadNr(), numThreads(), globalForceArrayX, tree.numberOfPoints(), 0.0f);
    for_loop_array_set(threadNr(), numThreads(), globalForceArrayY, tree.numberOfPoints(), 0.0f);

    // reset the thread's force array
    for (__uint32 i = 0; i < tree.numberOfPoints(); i++)
    {
        threadsForceArrayX[i] = 0.0f;
        threadsForceArrayY[i] = 0.0f;
    }
    __uint32 maxNumIt = options->preProcMaxNumIterations;
    for (__uint32 currNumIteration = 0; currNumIteration < maxNumIt; currNumIteration++)
    {
        // iterate over all edges and store the resulting forces in the thread's array
        for_loop(edgePartition,
            edge_force_function< EDGE_FORCE_DIV_DEGREE >(localContext) // divide the forces by the degree of the node to avoid oscillation
        );
        // wait until all edges are done
        sync();
        // now collect the forces in parallel, put the sum into the global array and move the nodes accordingly
        for_loop(nodePointPartition,
            func_comp(
                collect_force_function<COLLECT_EDGE_FACTOR_PREP | COLLECT_ZERO_THREAD_ARRAY>(localContext),
                node_move_function<TIME_STEP_PREP | ZERO_GLOBAL_ARRAY>(localContext)
            )
        );
    }
    if (isMainThread())
        globalContext->coolDown = 1.0f;
    sync();
    for (__uint32 currNumIteration = 0; (currNumIteration < maxNumIterations) && !globalContext->earlyExit; currNumIteration++)
    {
        // reset the coefficients
        for_loop_array_set(threadNr(), numThreads(), treeExp.m_multiExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);
        for_loop_array_set(threadNr(), numThreads(), treeExp.m_localExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);
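        // note (assumed from the layout): each expansion holds m_numCoeff complex
        // coefficients stored as interleaved real/imaginary doubles, hence the
        // factor (m_numCoeff << 1) in the array lengths above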
        localContext->maxForceSq = 0.0;
        localContext->avgForce   = 0.0;

        // construct the quadtree
        quadtreeConstruction(nodePointPartition);
        // wait for all threads to finish
        sync();

        if (isSingleThreaded()) // if this is a single threaded run, use the simple approximation
            multipoleApproxSingleThreaded(nodePointPartition);
        else // otherwise use the partitioning
            multipoleApproxFinal(nodePointPartition);
        // now wait until all forces are summed up in the global array and mapped to graph node order
        sync();

        // run the edge forces
        for_loop(edgePartition, // iterate over all edges and sum up the forces in the thread's array
            edge_force_function< EDGE_FORCE_DIV_DEGREE >(localContext) // divide the forces by the degree of the node to avoid oscillation
        );
        // wait until the edges are finished
        sync();

        // collect the edge forces and move the nodes without waiting
        for_loop(nodePointPartition,
            func_comp(
                collect_force_function<COLLECT_EDGE_FACTOR | COLLECT_ZERO_THREAD_ARRAY>(localContext),
                node_move_function<TIME_STEP_NORMAL | ZERO_GLOBAL_ARRAY>(localContext)
            )
        );
        // wait so we can decide if we need another iteration
        sync();
        // check the max force square for all threads
        if (isMainThread())
        {
            double maxForceSq = 0.0;
            for (__uint32 j = 0; j < numThreads(); j++)
                maxForceSq = max(globalContext->pLocalContext[j]->maxForceSq, maxForceSq);

            // if we are allowed to quit and the max force square falls below the threshold, tell all threads we are done
            if ((currNumIteration >= minNumIterations) && (maxForceSq < globalContext->pOptions->stopCritForce))
            {
                globalContext->earlyExit = true;
            }
        }
        // this barrier is required to wait for the earlyExit result
        sync();
    }
}
//! allocates the global context and a local context for each thread
FMEGlobalContext* FMEMultipoleKernel::allocateContext(ArrayGraph* pGraph, FMEGlobalOptions* pOptions, __uint32 numThreads)
{
    FMEGlobalContext* globalContext = new FMEGlobalContext();

    globalContext->numThreads = numThreads;
    globalContext->pOptions   = pOptions;
    globalContext->pGraph     = pGraph;
    globalContext->pQuadtree  = new LinearQuadtree(pGraph->numNodes(), pGraph->nodeXPos(), pGraph->nodeYPos(), pGraph->nodeSize());
    globalContext->pWSPD      = globalContext->pQuadtree->wspd();
    globalContext->pExpansion = new LinearQuadtreeExpansion(globalContext->pOptions->multipolePrecision, (*globalContext->pQuadtree));
    __uint32 numPoints = globalContext->pQuadtree->numberOfPoints();
    typedef FMELocalContext* FMELocalContextPtr;

    globalContext->pLocalContext = new FMELocalContextPtr[numThreads];
    globalContext->globalForceX  = (float*)MALLOC_16(sizeof(float)*numPoints);
    globalContext->globalForceY  = (float*)MALLOC_16(sizeof(float)*numPoints);
    for (__uint32 i = 0; i < numThreads; i++)
    {
        globalContext->pLocalContext[i] = new FMELocalContext;
        globalContext->pLocalContext[i]->forceX = (float*)MALLOC_16(sizeof(float)*numPoints);
        globalContext->pLocalContext[i]->forceY = (float*)MALLOC_16(sizeof(float)*numPoints);
        globalContext->pLocalContext[i]->pGlobalContext = globalContext;
    }
    return globalContext;
}
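/* A minimal sketch of the intended context life cycle (hypothetical caller;
 * the thread pool that runs one kernel instance per thread is omitted):
 *
 *   FMEGlobalContext* ctx = FMEMultipoleKernel::allocateContext(pGraph, pOptions, numThreads);
 *   // spawn numThreads threads, each invoking a kernel's operator()(ctx)
 *   FMEMultipoleKernel::deallocateContext(ctx); // see below
 */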
//! frees the memory for all contexts
void FMEMultipoleKernel::deallocateContext(FMEGlobalContext* globalContext)
{
    __uint32 numThreads = globalContext->numThreads;
    for (__uint32 i = 0; i < numThreads; i++)
    {
        FREE_16(globalContext->pLocalContext[i]->forceX);
        FREE_16(globalContext->pLocalContext[i]->forceY);
        delete globalContext->pLocalContext[i];
    }
    FREE_16(globalContext->globalForceX);
    FREE_16(globalContext->globalForceY);
    delete[] globalContext->pLocalContext;
    delete globalContext->pExpansion;
    delete globalContext->pQuadtree;
    delete globalContext;
}

} // end of namespace ogdf