Compare commits

...

1 commit

Author SHA1 Message Date
Gered 07b093ee39 initial stab at a simd version of per_pixel_triangle_2d
the results are highly disappointing. i am almost certainly going to
seriously revisit scanline-based triangle rasterization as a result of
this.

first off, there are some annoying floating point inaccuracies
introduced with this change that are throwing some things off slightly
(most noticeable in texture sampling). i noticed this when manually
comparing barycentric coordinates for a small triangle both with and
without simd: there were small discrepancies. i assume this is the
cause, but i am at a loss as to how this has crept in and how it is so
noticeable despite the fact that the differences are very small. ugh.

second, the performance improvement is ... negligible. seriously
negligible. ONLY the simplest triangle rendering sees some small
gains (like solid color, no per-pixel interpolation). to be honest,
i kind of expected that i would see no gains in the more advanced
triangle rendering (like multicolor+blended, textured+blended and also
textured+multicolor+blended) because in those cases the per-pixel
operations really seem to dwarf the cost of all of the pixel/coordinate
stepping that is happening outside the pixel_fn code. the "fix" here
is likely to either use avx / avx512 simd and try to write some
pixel_fn code that can calculate multiple pixels simultaneously. this is
complicated though, because there are still unanswered questions about
how this could be worked into the existing pixel_fn "architecture" and
how to handle cases where only some of the 4 pixels in each step are
to be rendered. maybe easy enough, but i suspect this ends up adding
more comparisons and branches in the inner loop ... which may just
continue to kill any possible performance gains. the other option is
to use threads somehow, although if threads are on the table, i'd
rather just do parallel row rendering ... that would seem to be
significantly simpler and more efficient?

however, i do not want to require avx / avx512 support right now.
avx probably wouldn't be unreasonable, but i don't believe avx512 is
ubiquitous enough yet. ideally, i'd want any code i write to be runnable
on any x86 machine from the past 10-15 years. this is also what makes
even avx questionable in my mind.

threading ... i am unsure of. frankly, half the point of this library
was to keep things simple in an "old school" kind of way, and threading
seems the opposite of that to me.

ugh ugh ugh. i fucking hate this.
2023-04-26 18:55:14 -04:00

View file

@ -1,5 +1,8 @@
use std::simd;
use std::simd::SimdPartialOrd;
use crate::graphics::{Bitmap, Pixel}; use crate::graphics::{Bitmap, Pixel};
use crate::math::{NearlyEqual, Rect, Vector2}; use crate::math::{nearly_equal_simd, NearlyEqual, Rect, Vector2};
#[inline] #[inline]
pub fn edge_function(a: Vector2, b: Vector2, c: Vector2) -> f32 { pub fn edge_function(a: Vector2, b: Vector2, c: Vector2) -> f32 {
@ -26,17 +29,32 @@ struct TriangleEdge {
y_inc: f32, y_inc: f32,
is_bottom_right_edge: bool, is_bottom_right_edge: bool,
origin: f32, origin: f32,
x_inc_simd: simd::f32x4,
y_inc_simd: simd::f32x4,
origin_simd: simd::f32x4,
} }
impl TriangleEdge { impl TriangleEdge {
pub fn from(v1: Vector2, v2: Vector2, initial_sample_point: Vector2) -> Self { pub fn from(v1: Vector2, v2: Vector2, initial_sample_point: Vector2) -> Self {
let x_inc = v1.y - v2.y; let x_inc = v1.y - v2.y;
let y_inc = v2.x - v1.x; let y_inc = v2.x - v1.x;
// One SIMD step along x covers four pixels at once, so the packed x increment is four scalar
// increments. Stepping along y moves all four lanes down by the same single row.
let x_inc_simd = simd::f32x4::splat(x_inc * 4.0);
let y_inc_simd = simd::f32x4::splat(y_inc);
let origin = edge_function(v1, v2, initial_sample_point);
// Lane N must hold the edge value N pixels to the right of the initial sample point, because
// the rasterizers pair lane N with pixel `x + N`. The offsets are therefore 0, 1, 2 and 3
// scalar increments; starting lane 1 at 2 increments would sample lanes 1..3 one pixel too far
// to the right.
let origin_simd = simd::f32x4::from_array([
    origin,
    origin + x_inc,
    origin + (x_inc * 2.0),
    origin + (x_inc * 3.0),
]);
Self { Self {
x_inc, x_inc,
y_inc, y_inc,
is_bottom_right_edge: is_bottom_right_edge(v2, v1), is_bottom_right_edge: is_bottom_right_edge(v2, v1),
origin: edge_function(v1, v2, initial_sample_point), origin,
x_inc_simd,
y_inc_simd,
origin_simd,
} }
} }
@ -47,6 +65,11 @@ impl TriangleEdge {
value <= 0.0 value <= 0.0
} }
/// Four-lane form of the inside test: a lane of the returned mask is set when the matching
/// lane of `value` is less than or equal to zero.
#[inline]
pub fn is_inside_simd(&self, value: simd::f32x4) -> simd::mask32x4 {
    let zero = simd::f32x4::splat(0.0);
    value.simd_le(zero)
}
#[inline] #[inline]
pub fn is_on_fill_edge(&self, value: f32) -> bool { pub fn is_on_fill_edge(&self, value: f32) -> bool {
// skip bottom-right edge pixels so we only draw pixels inside the triangle as well as those that lie // skip bottom-right edge pixels so we only draw pixels inside the triangle as well as those that lie
@ -54,25 +77,207 @@ impl TriangleEdge {
!(self.is_bottom_right_edge && value.nearly_equal(0.0, f32::EPSILON)) !(self.is_bottom_right_edge && value.nearly_equal(0.0, f32::EPSILON))
} }
/// Four-lane form of the fill-edge test. A lane is cleared only when this edge is flagged as a
/// bottom-right edge and `nearly_equal_simd` reports that lane of `value` as equal to zero
/// (compared with `f32::EPSILON`); every other lane is set.
#[inline]
pub fn is_on_fill_edge_simd(&self, value: simd::f32x4) -> simd::mask32x4 {
    let zero = simd::f32x4::splat(0.0);
    let lies_on_edge = nearly_equal_simd(value, zero, f32::EPSILON);
    let excluded = self.is_bottom_right_edge & lies_on_edge;
    !excluded
}
#[inline] #[inline]
pub fn evaluate(&self, value: f32) -> bool { pub fn evaluate(&self, value: f32) -> bool {
self.is_inside(value) && self.is_on_fill_edge(value) self.is_inside(value) && self.is_on_fill_edge(value)
} }
/// Combines the inside test and the fill-edge test for four edge values at once. A lane of the
/// result is set only when both tests pass for that lane.
#[inline]
pub fn evaluate_simd(&self, value: simd::f32x4) -> simd::mask32x4 {
    let inside = self.is_inside_simd(value);
    let fillable = self.is_on_fill_edge_simd(value);
    inside & fillable
}
#[inline] #[inline]
pub fn step_x(&self, value: f32) -> f32 { pub fn step_x(&self, value: f32) -> f32 {
value + self.x_inc value + self.x_inc
} }
/// Advances four packed edge values along x by adding the stored packed x increment
/// (`x_inc_simd`) to every lane of `value`.
#[inline]
pub fn step_x_simd(&self, value: simd::f32x4) -> simd::f32x4 {
    let stepped = value + self.x_inc_simd;
    stepped
}
#[inline] #[inline]
pub fn step_y(&self, value: f32) -> f32 { pub fn step_y(&self, value: f32) -> f32 {
value + self.y_inc value + self.y_inc
} }
/// Advances four packed edge values along y by adding the stored packed y increment
/// (`y_inc_simd`) to every lane of `value`.
#[inline]
pub fn step_y_simd(&self, value: simd::f32x4) -> simd::f32x4 {
    let stepped = value + self.y_inc_simd;
    stepped
}
#[inline] #[inline]
pub fn origin(&self) -> f32 { pub fn origin(&self) -> f32 {
self.origin self.origin
} }
/// Returns the packed four-lane starting edge values stored in this edge's `origin_simd`
/// field.
#[inline]
pub fn origin_simd(&self) -> simd::f32x4 {
self.origin_simd
}
}
/// Rasterizes the triangle described by the three edges over `bounds`, testing four
/// horizontally adjacent pixels per step.
///
/// For every group of four pixels in a row the three edges are evaluated lane-wise. For each
/// lane that passes all three edge tests, `pixel_fn` is called with the destination pixel and
/// that lane's three edge values.
///
/// This routine only handles whole groups of four pixels; it is meant for draw widths that are
/// a multiple of four.
fn triangle_2d_4x_width<PixelType: Pixel>(
    dest: &mut Bitmap<PixelType>,
    edge_bc: TriangleEdge,
    edge_ca: TriangleEdge,
    edge_ab: TriangleEdge,
    bounds: Rect,
    pixel_fn: impl Fn(&mut PixelType, f32, f32, f32),
) {
    let span = bounds.width as usize;
    let stride = dest.width() as usize;
    // SAFETY: assumes the caller passes `bounds` lying inside `dest` -- not checked here.
    let mut row_ptr = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };

    // Edge values for the first four pixels of the current row.
    let mut row_w0 = edge_bc.origin_simd();
    let mut row_w1 = edge_ca.origin_simd();
    let mut row_w2 = edge_ab.origin_simd();

    for _ in bounds.y..=bounds.bottom() {
        let (mut w0, mut w1, mut w2) = (row_w0, row_w1, row_w2);
        // SAFETY: assumes `span` pixels starting at `row_ptr` belong to `dest` (see above).
        let row = unsafe { std::slice::from_raw_parts_mut(row_ptr, span) };

        for quad in row.chunks_exact_mut(4) {
            let covered =
                edge_bc.evaluate_simd(w0) & edge_ca.evaluate_simd(w1) & edge_ab.evaluate_simd(w2);
            // Skip the per-lane work entirely when none of the four pixels is covered.
            if covered.any() {
                for (lane, pixel) in quad.iter_mut().enumerate() {
                    if covered.test(lane) {
                        pixel_fn(pixel, w0[lane], w1[lane], w2[lane]);
                    }
                }
            }
            w0 = edge_bc.step_x_simd(w0);
            w1 = edge_ca.step_x_simd(w1);
            w2 = edge_ab.step_x_simd(w2);
        }

        row_w0 = edge_bc.step_y_simd(row_w0);
        row_w1 = edge_ca.step_y_simd(row_w1);
        row_w2 = edge_ab.step_y_simd(row_w2);
        row_ptr = unsafe { row_ptr.add(stride) };
    }
}
/// Rasterizes the triangle described by the three edges over `bounds` when the draw width is
/// not a multiple of four.
///
/// Each row is split into whole groups of four pixels, which are tested lane-wise, followed by
/// up to three leftover pixels that are tested one at a time with the scalar edge functions.
/// For every pixel that passes all three edge tests, `pixel_fn` is called with the destination
/// pixel and its three edge values.
fn triangle_2d_4x_width_and_remainder<PixelType: Pixel>(
    dest: &mut Bitmap<PixelType>,
    edge_bc: TriangleEdge,
    edge_ca: TriangleEdge,
    edge_ab: TriangleEdge,
    bounds: Rect,
    pixel_fn: impl Fn(&mut PixelType, f32, f32, f32),
) {
    let draw_width = bounds.width as usize;
    let next_row_inc = dest.width() as usize;
    // SAFETY: assumes the caller passes `bounds` lying inside `dest` -- not checked here.
    let mut pixels = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };

    let mut w0_row = edge_bc.origin_simd();
    let mut w1_row = edge_ca.origin_simd();
    let mut w2_row = edge_ab.origin_simd();

    for _ in bounds.y..=bounds.bottom() {
        let mut w0 = w0_row;
        let mut w1 = w1_row;
        let mut w2 = w2_row;

        // SAFETY: assumes `draw_width` pixels starting at `pixels` belong to `dest` (see above).
        let row_pixels = unsafe { std::slice::from_raw_parts_mut(pixels, draw_width) };

        // Only whole groups of four go through the SIMD path. Running it over the full width
        // would index up to three pixels past the end of the row on its last iteration.
        let mut quads = row_pixels.chunks_exact_mut(4);
        for quad in quads.by_ref() {
            let mask =
                edge_bc.evaluate_simd(w0) & edge_ca.evaluate_simd(w1) & edge_ab.evaluate_simd(w2);
            if mask.any() {
                for (lane, pixel) in quad.iter_mut().enumerate() {
                    if mask.test(lane) {
                        pixel_fn(pixel, w0[lane], w1[lane], w2[lane]);
                    }
                }
            }
            w0 = edge_bc.step_x_simd(w0);
            w1 = edge_ca.step_x_simd(w1);
            w2 = edge_ab.step_x_simd(w2);
        }

        // After the last full group, lane 0 holds the edge value of the first leftover pixel:
        // it starts at the row origin and has advanced four pixels per SIMD step. The scalar
        // tail continues from there.
        let mut w0 = w0[0];
        let mut w1 = w1[0];
        let mut w2 = w2[0];
        for pixel in quads.into_remainder() {
            if edge_bc.evaluate(w0) && edge_ca.evaluate(w1) && edge_ab.evaluate(w2) {
                pixel_fn(pixel, w0, w1, w2)
            }
            w0 = edge_bc.step_x(w0);
            w1 = edge_ca.step_x(w1);
            w2 = edge_ab.step_x(w2);
        }

        w0_row = edge_bc.step_y_simd(w0_row);
        w1_row = edge_ca.step_y_simd(w1_row);
        w2_row = edge_ab.step_y_simd(w2_row);
        pixels = unsafe { pixels.add(next_row_inc) };
    }
}
/// Scalar rasterization path: visits every pixel of every row in `bounds`, one pixel at a time.
///
/// The three edge values start at each edge's scalar origin, advance with `step_x` per pixel
/// and with `step_y` per row. For every pixel where all three edges evaluate to true,
/// `pixel_fn` is called with the destination pixel and the three edge values.
fn triangle_2d_any_width<PixelType: Pixel>(
dest: &mut Bitmap<PixelType>,
edge_bc: TriangleEdge,
edge_ca: TriangleEdge,
edge_ab: TriangleEdge,
bounds: Rect,
pixel_fn: impl Fn(&mut PixelType, f32, f32, f32),
) {
let draw_width = bounds.width as usize;
// Distance, in pixels, between the start of one destination row and the next.
let next_row_inc = dest.width() as usize;
// SAFETY: assumes the caller passes `bounds` lying inside `dest` -- not checked here.
let mut pixels = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };
// Edge values at the first pixel of the current row.
let mut w0_row = edge_bc.origin();
let mut w1_row = edge_ca.origin();
let mut w2_row = edge_ab.origin();
for _ in bounds.y..=bounds.bottom() {
let mut w0 = w0_row;
let mut w1 = w1_row;
let mut w2 = w2_row;
// SAFETY: assumes `draw_width` pixels starting at `pixels` belong to `dest` (see above).
let row_pixels = unsafe { std::slice::from_raw_parts_mut(pixels, draw_width) };
for pixel in row_pixels.iter_mut() {
// Draw only where all three edge tests pass.
if edge_bc.evaluate(w0) && edge_ca.evaluate(w1) && edge_ab.evaluate(w2) {
pixel_fn(pixel, w0, w1, w2)
}
w0 = edge_bc.step_x(w0);
w1 = edge_ca.step_x(w1);
w2 = edge_ab.step_x(w2);
}
// Move the row-start edge values and the destination pointer down one row.
w0_row = edge_bc.step_y(w0_row);
w1_row = edge_ca.step_y(w1_row);
w2_row = edge_ab.step_y(w2_row);
pixels = unsafe { pixels.add(next_row_inc) };
}
} }
#[inline] #[inline]
@ -103,33 +308,11 @@ pub fn per_pixel_triangle_2d<PixelType: Pixel>(
let edge_ca = TriangleEdge::from(c, a, p); let edge_ca = TriangleEdge::from(c, a, p);
let edge_ab = TriangleEdge::from(a, b, p); let edge_ab = TriangleEdge::from(a, b, p);
let mut w0_row = edge_bc.origin(); if bounds.width % 4 == 0 {
let mut w1_row = edge_ca.origin(); triangle_2d_4x_width(dest, edge_bc, edge_ca, edge_ab, bounds, pixel_fn);
let mut w2_row = edge_ab.origin(); } else if bounds.width > 4 {
triangle_2d_4x_width_and_remainder(dest, edge_bc, edge_ca, edge_ab, bounds, pixel_fn);
let draw_width = bounds.width as usize; } else {
let next_row_inc = dest.width() as usize; triangle_2d_any_width(dest, edge_bc, edge_ca, edge_ab, bounds, pixel_fn);
let mut pixels = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };
for _ in bounds.y..=bounds.bottom() {
let mut w0 = w0_row;
let mut w1 = w1_row;
let mut w2 = w2_row;
let row_pixels = unsafe { std::slice::from_raw_parts_mut(pixels, draw_width) };
for pixel in row_pixels.iter_mut() {
if edge_bc.evaluate(w0) && edge_ca.evaluate(w1) && edge_ab.evaluate(w2) {
pixel_fn(pixel, w0, w1, w2)
}
w0 = edge_bc.step_x(w0);
w1 = edge_ca.step_x(w1);
w2 = edge_ab.step_x(w2);
}
w0_row = edge_bc.step_y(w0_row);
w1_row = edge_ca.step_y(w1_row);
w2_row = edge_ab.step_y(w2_row);
pixels = unsafe { pixels.add(next_row_inc) };
} }
} }