/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_xfb_info.h"

#include <util/u_math.h>

/* Appends the transform-feedback outputs generated by one variable.
 *
 * Walks `type` (initially var->type) recursively: non-compact arrays and
 * matrices are visited element by element and structs field by field.  Each
 * leaf records its buffer/stream usage in `xfb` and appends one entry to
 * xfb->outputs per location it touches.
 *
 * `location` and `offset` are running cursors shared across the recursion:
 * every appended entry advances *location by one and *offset by four bytes
 * per component set in that entry's mask.
 *
 * Nothing here checks the capacity of xfb->outputs; the caller must have
 * sized it to hold every entry that gets appended.
 */
static void
add_var_xfb_outputs(nir_xfb_info *xfb,
                    nir_variable *var,
                    unsigned *location,
                    unsigned *offset,
                    const struct glsl_type *type)
{
   /* Anything holding a 64-bit value starts on an 8-byte boundary. */
   if (glsl_type_contains_64bit(type))
      *offset = ALIGN_POT(*offset, 8);

   /* Non-compact arrays and matrices: one recursive visit per element. */
   if (glsl_type_is_array_or_matrix(type) && !var->data.compact) {
      const struct glsl_type *elem_type = glsl_get_array_element(type);
      const unsigned num_elems = glsl_get_length(type);
      for (unsigned e = 0; e < num_elems; e++)
         add_var_xfb_outputs(xfb, var, location, offset, elem_type);
      return;
   }

   /* Structs: one recursive visit per field, in declaration order. */
   if (glsl_type_is_struct(type)) {
      const unsigned num_fields = glsl_get_length(type);
      for (unsigned f = 0; f < num_fields; f++) {
         add_var_xfb_outputs(xfb, var, location, offset,
                             glsl_get_struct_field(type, f));
      }
      return;
   }

   /* From here on we are at a leaf value, or at a compact array that is
    * handled as a single unit.
    */
   const unsigned buffer = var->data.xfb_buffer;
   const unsigned frac = var->data.location_frac;

   assert(buffer < NIR_MAX_XFB_BUFFERS);
   const int buffer_bit = 1 << buffer;

   if (!(xfb->buffers_written & buffer_bit)) {
      /* First write to this buffer: remember its stride and stream. */
      xfb->buffers_written |= buffer_bit;
      xfb->strides[buffer] = var->data.xfb_stride;
      xfb->buffer_to_stream[buffer] = var->data.stream;
   } else {
      /* Every variable sharing a buffer must agree on stride and stream. */
      assert(xfb->strides[buffer] == var->data.xfb_stride);
      assert(xfb->buffer_to_stream[buffer] == var->data.stream);
   }

   assert(var->data.stream < NIR_MAX_XFB_STREAMS);
   xfb->streams_written |= (1 << var->data.stream);

   unsigned num_comps;
   if (!var->data.compact) {
      num_comps = glsl_get_component_slots(type);

      unsigned num_slots = DIV_ROUND_UP(num_comps, 4);
      assert(num_slots == glsl_count_attribute_slots(type, false));

      /* Ensure that we don't have, for instance, a dvec2 with a
       * location_frac of 2 which would make it cross a location boundary
       * even though it fits in a single slot.  However, you can have a
       * dvec3 which crosses the slot boundary with a location_frac of 2.
       */
      assert(DIV_ROUND_UP(frac + num_comps, 4) == num_slots);
   } else {
      /* This only happens for clip/cull which are float arrays */
      assert(glsl_without_array(type) == glsl_float_type());
      assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
             var->data.location == VARYING_SLOT_CLIP_DIST1);
      num_comps = glsl_get_length(type);
   }

   /* The mask spans at most two locations: four bits per location. */
   assert(frac + num_comps <= 8);

   for (uint8_t mask = ((1 << num_comps) - 1) << frac; mask != 0; mask >>= 4) {
      nir_xfb_output_info *out = &xfb->outputs[xfb->output_count];
      xfb->output_count++;

      out->buffer = buffer;
      out->offset = *offset;
      out->location = *location;
      out->component_mask = mask & 0xf;

      /* Each captured component occupies four bytes in the buffer. */
      *offset += util_bitcount(out->component_mask) * 4;
      (*location)++;
   }
}

/* qsort() comparator ordering nir_xfb_output_info entries by ascending
 * offset.
 *
 * Returns a negative value, zero, or a positive value when `_a` sorts
 * before, equal to, or after `_b`.  The offsets are compared explicitly
 * instead of being subtracted so that the sign of the result is correct
 * whatever the width or signedness of the offset field; a difference can
 * wrap or be truncated when converted to int.
 */
static int
compare_xfb_output_offsets(const void *_a, const void *_b)
{
   const nir_xfb_output_info *a = _a, *b = _b;
   return (a->offset > b->offset) - (a->offset < b->offset);
}

/* Builds the transform-feedback description of a shader.
 *
 * Only vertex, tessellation-evaluation and geometry shaders are accepted
 * (asserted).  Every output variable that has an explicit xfb buffer or
 * stride contributes entries; such variables must have an explicit buffer,
 * stride and offset (asserted).
 *
 * Returns NULL when no output variable contributes.  Otherwise returns an
 * nir_xfb_info obtained from rzalloc_size() with `mem_ctx` as its context,
 * whose outputs array is sorted by ascending offset.
 */
nir_xfb_info *
nir_gather_xfb_info(const nir_shader *shader, void *mem_ctx)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX ||
          shader->info.stage == MESA_SHADER_TESS_EVAL ||
          shader->info.stage == MESA_SHADER_GEOMETRY);

   /* Compute the number of outputs we have.  This is simply the number of
    * cumulative locations consumed by all the variables.  If a location is
    * represented by multiple variables, then they each count separately in
    * number of outputs.
    *
    * This is an upper bound used for allocation, not an exact count: a
    * compact array is counted here as one attribute slot per element but is
    * later emitted as packed components, i.e. one output per four elements.
    */
   unsigned num_outputs = 0;
   nir_foreach_variable(var, &shader->outputs) {
      if (var->data.explicit_xfb_buffer ||
          var->data.explicit_xfb_stride) {
         assert(var->data.explicit_xfb_buffer &&
                var->data.explicit_xfb_stride &&
                var->data.explicit_offset);
         num_outputs += glsl_count_attribute_slots(var->type, false);
      }
   }
   if (num_outputs == 0)
      return NULL;

   nir_xfb_info *xfb = rzalloc_size(mem_ctx, nir_xfb_info_size(num_outputs));

   /* Walk the list of outputs and add them to the array */
   nir_foreach_variable(var, &shader->outputs) {
      if (var->data.explicit_xfb_buffer ||
          var->data.explicit_xfb_stride) {
         unsigned location = var->data.location;
         unsigned offset = var->data.offset;
         add_var_xfb_outputs(xfb, var, &location, &offset, var->type);
      }
   }

   /* If this fails, more outputs were generated than were allocated above,
    * which means something went wrong.  Fewer is legitimate (see the note on
    * compact arrays where num_outputs is computed).
    */
   assert(xfb->output_count <= num_outputs);

   /* Everything is easier in the state setup code if the list is sorted in
    * order of output offset.
    */
   qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]),
         compare_xfb_output_offsets);

   /* Finally, do a sanity check: within each buffer, outputs must not
    * overlap and every output must capture at least one component.
    */
   unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0};
   for (unsigned i = 0; i < xfb->output_count; i++) {
      assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]);
      assert(xfb->outputs[i].component_mask != 0);
      unsigned slots = util_bitcount(xfb->outputs[i].component_mask);
      max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4;
   }

   return xfb;
}
