Propagate
==========

python side
-------------




::

    096     @profile_if_possible
    097     def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
    098                   max_blocks=1024, max_steps=10, use_weights=False,
    099                   scatter_first=0):
    ...
    110         nphotons = self.pos.size
    111         step = 0
    112         input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
    113         input_queue[0] = 0
    114         # Order photons initially in the queue to put the clones next to each other
    115         for copy in xrange(self.ncopies):
    116             input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
    117         input_queue_gpu = ga.to_gpu(input_queue)
    118         output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
    119         output_queue[0] = 1
    120         output_queue_gpu = ga.to_gpu(output_queue)
    121 
    122         while step < max_steps:
    123             # Just finish the rest of the steps if the # of photons is low
    124             if nphotons < nthreads_per_block * 16 * 8 or use_weights:
    125                 nsteps = max_steps - step
    126             else:
    127                 nsteps = 1
    128 
    129             for first_photon, photons_this_round, blocks in \
    130                     chunk_iterator(nphotons, nthreads_per_block, max_blocks):
    131                 self.gpu_funcs.propagate(
                                 np.int32(first_photon), 
                                 np.int32(photons_this_round), 
                                 input_queue_gpu[1:], 
                                 output_queue_gpu, 
                                 rng_states, 
                                 self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, 
                                 self.last_hit_triangles, 
                                 self.weights, 
                                 np.int32(nsteps),   ## CAUTION thats max_steps on cuda side
                                 np.int32(use_weights), 
                                 np.int32(scatter_first), 
                                 gpu_geometry.gpudata, 

                                 block=(nthreads_per_block,1,1), grid=(blocks, 1))
    132 
    133             step += nsteps
    134             scatter_first = 0 # Only allow non-zero in first pass
    135 
    136             if step < max_steps:
    137                 temp = input_queue_gpu
    138                 input_queue_gpu = output_queue_gpu
    139                 output_queue_gpu = temp
    140                 # Assign with a numpy array of length 1 to silence
    141                 # warning from PyCUDA about setting array with different strides/storage orders.
    142                 output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
    143                 nphotons = input_queue_gpu[:1].get()[0] - 1

    ///         stick the surviving propagated photons in output_queue into input_queue  

    144 
    145         if ga.max(self.flags).get() & (1 << 31):
    146             print >>sys.stderr, "WARNING: ABORTED PHOTONS"
    147         cuda.Context.get_current().synchronize()




cuda entry points
-------------------

::

    simon:cuda blyth$ grep -l blockIdx *.*
    bvh.cu
    daq.cu
    hybrid_render.cu
    mesh.h
    pdf.cu
    propagate.cu
    random.h
    render.cu
    tools.cu
    transform.cu


cuda propagate
----------------

Entry point is **propagate**, communication via numpy arrays curtesy of pycuda. 

* **self.flags** on corresponds to  **histories** array 
* **id** identifies the CUDA thread, corresponding to a single photon
* photon parameters indexed into the arrays with `photon_id` 


::

    112 __global__ void
    113 propagate(int first_photon, int nthreads, unsigned int *input_queue,
    114       unsigned int *output_queue, curandState *rng_states,
    115       float3 *positions, float3 *directions,
    116       float *wavelengths, float3 *polarizations,
    117       float *times, unsigned int *histories,
    118       int *last_hit_triangles, float *weights,
    119       int max_steps, int use_weights, int scatter_first,
    120       Geometry *g)
    121 {
    122     __shared__ Geometry sg;
    123 
    124     if (threadIdx.x == 0)
    125     sg = *g;
    //
    // shared geometry between threads
    //
    126 
    127     __syncthreads();
    128 
    129     int id = blockIdx.x*blockDim.x + threadIdx.x;
    //
    //  id points at the single photon to propagate in this parallel thread
    //
    130 
    131     if (id >= nthreads)
    132     return;
    133 
    134     g = &sg;
    135 
    136     curandState rng = rng_states[id];
    137 
    138     int photon_id = input_queue[first_photon + id];
    139 
    140     Photon p;
    141     p.position = positions[photon_id];
    142     p.direction = directions[photon_id];
    143     p.direction /= norm(p.direction);
    144     p.polarization = polarizations[photon_id];
    145     p.polarization /= norm(p.polarization);
    146     p.wavelength = wavelengths[photon_id];
    147     p.time = times[photon_id];
    148     p.last_hit_triangle = last_hit_triangles[photon_id];
    149     p.history = histories[photon_id];
    150     p.weight = weights[photon_id];
    151 
    152     if (p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB | NAN_ABORT))
    153     return;
    154 
    155     State s;
    156 
    157     int steps = 0;
    158     while (steps < max_steps) {
    159     steps++;
    160 
    161     int command;
    162 
    163     // check for NaN and fail
    164     if (isnan(p.direction.x*p.direction.y*p.direction.z*p.position.x*p.position.y*p.position.z)) {
    165         p.history |= NO_HIT | NAN_ABORT;
    166         break;
    167     }
    168 
    169     fill_state(s, p, g);
    170 
    171     if (p.last_hit_triangle == -1)
    172         break;
    173 
    174     command = propagate_to_boundary(p, s, rng, use_weights, scatter_first);
    //
    //      propagate_* only changes p (?) refering to state s   
    //
    175     scatter_first = 0; // Only use the scatter_first value once
    176 
    177     if (command == BREAK)
    178         break;
    179 
    180     if (command == CONTINUE)
    181         continue;
    182 
    183     if (s.surface_index != -1) {
    184       command = propagate_at_surface(p, s, rng, g, use_weights);
    185 
    186         if (command == BREAK)
    187         break;
    188 
    189         if (command == CONTINUE)
    190         continue;
    191     }
    192 
    193     propagate_at_boundary(p, s, rng);
    194 
    195     } // while (steps < max_steps)
    196 
    197     rng_states[id] = rng;
    198     positions[photon_id] = p.position;
    199     directions[photon_id] = p.direction;
    200     polarizations[photon_id] = p.polarization;
    201     wavelengths[photon_id] = p.wavelength;
    202     times[photon_id] = p.time;
    203     histories[photon_id] = p.history;
    204     last_hit_triangles[photon_id] = p.last_hit_triangle;
    205     weights[photon_id] = p.weight;
    206 
    207     // Not done, put photon in output queue
    208     if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB | NAN_ABORT)) == 0) {
    //
    //       the photon lives on thanks to 
    //            RAYLEIGH_SCATTER REFLECT_DIFFUSE REFLECT_SPECULAR SURFACE_REEMIT SURFACE_TRANSMIT BULK_REEMIT   
    //
    //
    209     int out_idx = atomicAdd(output_queue, 1);
    210     output_queue[out_idx] = photon_id;
    //
    //     http://supercomputingblog.com/cuda/cuda-tutorial-4-atomic-operations/
    //
    //         This atomicAdd function can be called within a kernel. When a thread executes this operation, a memory address is read, 
    //         has the value of val added to it, and the result is written back to memory. 
    //         The original value of the memory at location ?address? is returned to the thread.
    //
    211     }
    212 } // propagate




`chroma/cuda/photon.h`
~~~~~~~~~~~~~~~~~~~~~~~~

::

    584 __device__ int
    585 propagate_at_surface(Photon &p, State &s, curandState &rng, Geometry *geometry,
    586                      bool use_weights=false)
    587 {
    588     Surface *surface = geometry->surfaces[s.surface_index];
    589 
    590     if (surface->model == SURFACE_COMPLEX)
    591         return propagate_complex(p, s, rng, surface, use_weights);
    592     else if (surface->model == SURFACE_WLS)
    593         return propagate_at_wls(p, s, rng, surface, use_weights);
    594     else {
    595         // use default surface model: do a combination of specular and
    596         // diffuse reflection, detection, and absorption based on relative
    597         // probabilties



* `chroma/doc/source/surface.rst`