|
| 1 | +/****************************************************************************** |
| 2 | + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * Redistribution and use in source and binary forms, with or without |
| 5 | + * modification, are permitted provided that the following conditions are met: |
| 6 | + * * Redistributions of source code must retain the above copyright |
| 7 | + * notice, this list of conditions and the following disclaimer. |
| 8 | + * * Redistributions in binary form must reproduce the above copyright |
| 9 | + * notice, this list of conditions and the following disclaimer in the |
| 10 | + * documentation and/or other materials provided with the distribution. |
| 11 | + * * Neither the name of the NVIDIA CORPORATION nor the |
| 12 | + * names of its contributors may be used to endorse or promote products |
| 13 | + * derived from this software without specific prior written permission. |
| 14 | + * |
| 15 | + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 16 | + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 17 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 18 | + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| 19 | + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 20 | + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 21 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| 22 | + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 23 | + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 24 | + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 25 | + * |
| 26 | + ******************************************************************************/ |
| 27 | + |
| 28 | +/****************************************************************************** |
| 29 | + * |
| 30 | + * Code and text by Sean Baxter, NVIDIA Research |
| 31 | + * See http://nvlabs.github.io/moderngpu for repository and documentation. |
| 32 | + * |
| 33 | + ******************************************************************************/ |
| 34 | + |
| 35 | +#pragma once |
| 36 | + |
| 37 | +#include "ctasearch.cuh" |
| 38 | +#include "loadstore.cuh" |
| 39 | + |
| 40 | +namespace mgpu { |
| 41 | + |
| 42 | +//////////////////////////////////////////////////////////////////////////////// |
// DeviceSerialLoadBalanceSearch
| 44 | +// Upper Bound search from A (needles) into B (haystack). The A values are |
| 45 | +// natural numbers from aBegin to aEnd. bFirst is the index of the B value at |
| 46 | +// bBegin in shared memory. |
| 47 | + |
// Serial (per-thread) phase of the load-balancing search: an upper-bound
// style merge of the needles [aBegin, aEnd) against the staged haystack
// values b_shared[bBegin..]. Each time a needle is consumed, the global rank
// of its owning haystack element (bFirst + bBegin) is stored to
// a_shared[aBegin]. Exactly VT merge steps run, so the loop fully unrolls.
// When RangeCheck is false the caller must guarantee that sentinel values
// follow the haystack data, because B is advanced without bounds tests.
template<int VT, bool RangeCheck>
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
    int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {

    // Cache the current haystack key in a register.
    int haystackKey = b_shared[bBegin];

    #pragma unroll
    for(int step = 0; step < VT; ++step) {
        bool takeNeedle;
        if(RangeCheck) {
            // Guarded form: never consume needles at or past aEnd; once the
            // haystack range is exhausted, consume needles unconditionally.
            takeNeedle = (aBegin < aEnd) &&
                ((bBegin >= bEnd) || (aBegin < haystackKey));
        }
        else
            takeNeedle = aBegin < haystackKey;

        if(takeNeedle) {
            // Emit the owning haystack rank for this needle; advance A.
            a_shared[aBegin] = bFirst + bBegin;
            ++aBegin;
        }
        else {
            // Advance B (the haystack) and refresh the cached key.
            ++bBegin;
            haystackKey = b_shared[bBegin];
        }
    }
}
| 70 | + |
| 71 | +//////////////////////////////////////////////////////////////////////////////// |
| 72 | +// CTALoadBalance |
| 73 | +// Computes upper_bound(counting_iterator<int>(first), b_global) - 1. |
| 74 | + |
| 75 | +// Unlike most other CTA* functions, CTALoadBalance loads from global memory. |
| 76 | +// This returns the loaded B elements at the beginning or end of shared memory |
| 77 | +// depending on the aFirst argument. |
| 78 | + |
| 79 | +// CTALoadBalance requires NT * VT + 2 slots of shared memory. |
// Cooperative (whole-CTA) load-balancing search over one tile of NT * VT
// destination indices. For every needle a in this tile's range [a0, a1) it
// computes upper_bound(a, B) - 1 — i.e. the rank of the B interval owning a —
// and leaves that rank in shared memory at a_shared[a - a0]. The staged B
// values remain in shared memory after the A outputs (see b_shared below).
//
// destCount      - total number of destination (needle) indices.
// b_global       - iterator over the B values (haystack) in global memory.
// sourceCount    - number of B values.
// block, tid     - CTA index and thread rank used to locate this tile.
// mp_global      - precomputed partition data consumed by ComputeMergeRange.
// indices_shared - shared-memory scratch; requires NT * VT + 2 int slots.
// loadPrecedingB - when true (and b0 > 0), also stage B[b0 - 1] at
//                  b_shared[-1].
// Returns make_int4(a0, a1, b0, b1): this tile's A and B ranges, with b0
// lowered by one when the preceding B element was staged.
template<int NT, int VT, typename InputIt>
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
    int sourceCount, int block, int tid, const int* mp_global,
    int* indices_shared, bool loadPrecedingB) {

    // Partition the merge of A (needles) and B (haystack) for this CTA.
    int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
        mp_global);

    int a0 = range.x;
    int a1 = range.y;
    int b0 = range.z;
    int b1 = range.w;
    // The very first B range has no preceding element to load.
    if(!b0) loadPrecedingB = false;

    // Load one trailing term from B. If we're already at the end, fill the
    // end of the buffer with destCount.
    int aCount = a1 - a0;
    int bCount = b1 - b0;
    // extended == 1 when one trailing B value (b_global[b1]) is staged too.
    int extended = b1 < sourceCount;
    int loadCount = bCount + extended;
    // Remaining slots (of NT * VT + 1) to pad with destCount sentinels.
    int fillCount = NT * VT + 1 - loadCount - aCount;

    // A outputs occupy the front of the scratch buffer; B values follow them
    // (shifted by one more slot if the preceding B element is loaded).
    int* a_shared = indices_shared;
    int* b_shared = indices_shared + aCount + (int)loadPrecedingB;

    // Load the B values.
    // (Earlier DeviceMemToMemLoop implementation, kept for reference:)
//  DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
//      b_global + b0 - (int)loadPrecedingB, tid,
//      b_shared - (int)loadPrecedingB);

    // Cooperative strided copy of B[b0 .. b1 + extended) into shared memory;
    // when loadPrecedingB is set, thread 0 starts at i = -1 and stages
    // b_global[b0 - 1] into b_shared[-1].
    for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
        b_shared[i] = b_global[b0 + i];

    // Fill the end of the array with destCount. Every needle value is below
    // destCount, so the non-range-checked serial search below always
    // advances A once it reaches this padding.
    for(int i = tid + extended; i < fillCount; i += NT)
        b_shared[bCount + i] = destCount;
    // All staged B values and sentinels must be visible to every thread
    // before searching.
    __syncthreads();

    // Run a merge path to find the start of the serial merge for each thread.
    int diag = VT * tid;
    int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
        aCount, b_shared, bCount, diag, mgpu::less<int>());

    // This thread's starting needle value and haystack offset.
    int a0tid = a0 + mp;
    int b0tid = diag - mp;

    // Subtract 1 from b0 because we want to return upper_bound - 1.
    // Pass a_shared - a0 so the serial search can index its output with
    // absolute needle values.
    DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
        b0tid, bCount, a_shared - a0);
    // Ensure all per-thread results land in shared memory before returning.
    __syncthreads();

    // Report the actual first B index staged in shared memory.
    b0 -= (int)loadPrecedingB;
    return make_int4(a0, a1, b0, b1);
}
| 134 | + |
| 135 | + |
| 136 | +} // namespace mgpu |
0 commit comments