initial stab at a simd version of per_pixel_triangle_2d

the results are highly disappointing. i am almost certainly going to seriously revisit scanline-based triangle rasterization as a result of this. first off, there are some annoying floating point inaccuracy differences introduced with this change that are throwing some things off slightly (most noticeable in texture sampling). i noticed this while manually comparing barycentric coordinates for a small triangle both with and without simd: there were small discrepancies. i assume this is the cause, but i am at a loss as to how this has crept in and how it is so noticeable despite the fact that the differences are very small. ugh. second, the performance improvement is .... negligible. seriously negligible. ONLY the most simple triangle rendering sees some small gains (like solid color, no per-pixel interpolation). to be honest, i kind of expected that i would see no gains in the more advanced triangle rendering (like multicolor+blended, textured+blended and also textured+multicolor+blended) because in those cases the per-pixel operations really seem to dwarf the cost of all of the pixel/coordinate stepping that is happening outside the pixel_fn code. the "fix" here is likely to either use avx / avx512 simd and try to write some pixel_fn code that can calculate multiple pixels simultaneously. this is complicated though because there are as-yet unanswered questions about how this could be worked into the existing pixel_fn "architecture" and how to handle cases where only some of the 4 pixels in each step are to be rendered. maybe easy enough, but i suspect this ends up adding more comparisons and branches in the inner loop ... which may just continue to kill any possible performance gains. the other option is to use threads somehow, although if threads are on the table, i'd rather just do parallel row rendering ... that would seem to be significantly simpler and more efficient? however, i do not want to require avx / avx512 support right now.
probably avx wouldn't be unreasonable, but i don't believe avx512 is ubiquitous enough yet. i'd ideally want any code i write to be runnable on any x86 machine from the past 10-15 years. this is also what makes even avx questionable in my mind. threading ... i am unsure of. frankly, half the point of this library was to keep things simple in an "old school" kind of way, and threading seems the opposite of that to me. ugh ugh ugh. i fucking hate this.
2023-04-26 18:55:14 -04:00
1 changed files with 213 additions and 30 deletions
--- a/ggdt/src/graphics/bitmap/triangles.rs
+++ b/ggdt/src/graphics/bitmap/triangles.rs
@ -1,5 +1,8 @@
 use std::simd;
 use std::simd::SimdPartialOrd;
 use crate::graphics::{Bitmap, Pixel};
-use crate::math::{NearlyEqual, Rect, Vector2};
+use crate::math::{nearly_equal_simd, NearlyEqual, Rect, Vector2};
 #[inline]
 pub fn edge_function(a: Vector2, b: Vector2, c: Vector2) -> f32 {
@ -26,17 +29,32 @@ struct TriangleEdge {
 	y_inc: f32,
 	is_bottom_right_edge: bool,
 	origin: f32,
 	x_inc_simd: simd::f32x4,
 	y_inc_simd: simd::f32x4,
 	origin_simd: simd::f32x4,
 }
 impl TriangleEdge {
 	/// Builds the stepping state for the directed triangle edge `v1 -> v2`.
 	///
 	/// `initial_sample_point` is the point at which the edge function is first evaluated; the
 	/// resulting value is stored as `origin` and is the starting point for all subsequent
 	/// `step_x` / `step_y` calls.
 	///
 	/// The SIMD fields describe four horizontally adjacent pixels at once: lane `i` of
 	/// `origin_simd` holds the edge function value `i` pixels to the right of
 	/// `initial_sample_point`, and `x_inc_simd` advances all four lanes by four pixels.
 	pub fn from(v1: Vector2, v2: Vector2, initial_sample_point: Vector2) -> Self {
 		// change in the edge function value for a one-pixel step along x and along y respectively
 		let x_inc = v1.y - v2.y;
 		let y_inc = v2.x - v1.x;

 		let origin = edge_function(v1, v2, initial_sample_point);

 		// a single simd x-step moves every lane four pixels to the right, while a y-step moves every lane
 		// down by exactly one row
 		let x_inc_simd = simd::f32x4::splat(x_inc * 4.0);
 		let y_inc_simd = simd::f32x4::splat(y_inc);

 		// lane i must carry the value for pixel (x + i). the lanes were previously offset by
 		// 0, 2, 3 and 4 pixels, which handed pixels x+1..x+3 the weights of their right-hand neighbour.
 		let origin_simd = simd::f32x4::from_array([
 			origin, //
 			origin + x_inc,
 			origin + (x_inc * 2.0),
 			origin + (x_inc * 3.0),
 		]);

 		Self {
 			x_inc,
 			y_inc,
 			is_bottom_right_edge: is_bottom_right_edge(v2, v1),
 			origin,
 			x_inc_simd,
 			y_inc_simd,
 			origin_simd,
 		}
 	}
@ -47,6 +65,11 @@ impl TriangleEdge {
 		value <= 0.0
 	}
 	/// Four-lane variant of `is_inside`: each lane of the returned mask is set when the
 	/// corresponding lane of `value` is less than or equal to zero.
 	#[inline]
 	pub fn is_inside_simd(&self, value: simd::f32x4) -> simd::mask32x4 {
 		let zero = simd::f32x4::splat(0.0);
 		value.simd_le(zero)
 	}
 	#[inline]
 	pub fn is_on_fill_edge(&self, value: f32) -> bool {
 		// skip bottom-right edge pixels so we only draw pixels inside the triangle as well as those that lie
@ -54,25 +77,207 @@ impl TriangleEdge {
 		!(self.is_bottom_right_edge && value.nearly_equal(0.0, f32::EPSILON))
 	}
 	/// Four-lane variant of `is_on_fill_edge`.
 	///
 	/// A lane is cleared only when this edge is flagged as a bottom-right edge and
 	/// `nearly_equal_simd` reports that lane of `value` as equal to zero (using `f32::EPSILON`
 	/// as the tolerance argument); every other lane is set.
 	#[inline]
 	pub fn is_on_fill_edge_simd(&self, value: simd::f32x4) -> simd::mask32x4 {
 		let zero = simd::f32x4::splat(0.0);
 		let lies_on_edge = nearly_equal_simd(value, zero, f32::EPSILON);
 		let skipped = self.is_bottom_right_edge & lies_on_edge;
 		!skipped
 	}
 	/// Returns true when a pixel with edge function value `value` should be drawn with respect to
 	/// this edge, i.e. it passes both `is_inside` and `is_on_fill_edge`.
 	#[inline]
 	pub fn evaluate(&self, value: f32) -> bool {
 		// the fill-edge test is only consulted for values that already passed the inside test
 		if !self.is_inside(value) {
 			return false;
 		}
 		self.is_on_fill_edge(value)
 	}
 	/// Four-lane variant of `evaluate`: a lane of the returned mask is set when that lane of
 	/// `value` passes both `is_inside_simd` and `is_on_fill_edge_simd`.
 	#[inline]
 	pub fn evaluate_simd(&self, value: simd::f32x4) -> simd::mask32x4 {
 		let inside = self.is_inside_simd(value);
 		let fillable = self.is_on_fill_edge_simd(value);
 		inside & fillable
 	}
 	/// Returns `value` advanced by this edge's `x_inc`, i.e. the edge function value one pixel
 	/// further along the x axis.
 	#[inline]
 	pub fn step_x(&self, value: f32) -> f32 {
 		value + self.x_inc
 	}
 	/// Returns `value` with `x_inc_simd` added to every lane. `x_inc_simd` is built in `from` as
 	/// `x_inc * 4.0`, so one call moves all four lanes four pixels along the x axis.
 	#[inline]
 	pub fn step_x_simd(&self, value: simd::f32x4) -> simd::f32x4 {
 		value + self.x_inc_simd
 	}
 	/// Returns `value` advanced by this edge's `y_inc`, i.e. the edge function value one pixel
 	/// further along the y axis.
 	#[inline]
 	pub fn step_y(&self, value: f32) -> f32 {
 		value + self.y_inc
 	}
 	/// Returns `value` with `y_inc_simd` added to every lane. `y_inc_simd` is built in `from` as
 	/// a splat of the plain `y_inc`, so one call moves all four lanes down by a single row.
 	#[inline]
 	pub fn step_y_simd(&self, value: simd::f32x4) -> simd::f32x4 {
 		value + self.y_inc_simd
 	}
 	/// Returns the edge function value that was computed for the initial sample point passed
 	/// to `from`.
 	#[inline]
 	pub fn origin(&self) -> f32 {
 		self.origin
 	}
 	/// Returns the four-lane starting vector that `from` derived from `origin` and `x_inc`.
 	/// It is the simd counterpart of `origin` and the starting value for the `*_simd` steppers.
 	#[inline]
 	pub fn origin_simd(&self) -> simd::f32x4 {
 		self.origin_simd
 	}
 }
 /// Rasterizes a triangle over `bounds`, testing four horizontally adjacent pixels per step.
 ///
 /// This variant is only correct when `bounds.width` is a multiple of four (that is how
 /// `per_pixel_triangle_2d` dispatches to it). `pixel_fn` is invoked once for every covered pixel
 /// with that pixel's three edge function values (`edge_bc`, `edge_ca`, `edge_ab` order).
 fn triangle_2d_4x_width<PixelType: Pixel>(
 	dest: &mut Bitmap<PixelType>,
 	edge_bc: TriangleEdge,
 	edge_ca: TriangleEdge,
 	edge_ab: TriangleEdge,
 	bounds: Rect,
 	pixel_fn: impl Fn(&mut PixelType, f32, f32, f32),
 ) {
 	let row_len = bounds.width as usize;
 	let stride = dest.width() as usize;

 	// SAFETY: presumably requires `bounds` to lie entirely within `dest` -- TODO confirm against callers
 	let mut row_ptr = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };

 	let (mut row_w0, mut row_w1, mut row_w2) =
 		(edge_bc.origin_simd(), edge_ca.origin_simd(), edge_ab.origin_simd());

 	for _ in bounds.y..=bounds.bottom() {
 		let (mut w0, mut w1, mut w2) = (row_w0, row_w1, row_w2);

 		// SAFETY: same assumption as above; `row_len` pixels starting at `row_ptr` belong to this row
 		let row = unsafe { std::slice::from_raw_parts_mut(row_ptr, row_len) };

 		for group in row.chunks_exact_mut(4) {
 			let covered = edge_bc.evaluate_simd(w0) & edge_ca.evaluate_simd(w1) & edge_ab.evaluate_simd(w2);
 			// skip the per-lane checks entirely when none of the four pixels are covered
 			if covered.any() {
 				for (lane, pixel) in group.iter_mut().enumerate() {
 					if covered.test(lane) {
 						pixel_fn(pixel, w0[lane], w1[lane], w2[lane]);
 					}
 				}
 			}

 			w0 = edge_bc.step_x_simd(w0);
 			w1 = edge_ca.step_x_simd(w1);
 			w2 = edge_ab.step_x_simd(w2);
 		}

 		row_w0 = edge_bc.step_y_simd(row_w0);
 		row_w1 = edge_ca.step_y_simd(row_w1);
 		row_w2 = edge_ab.step_y_simd(row_w2);
 		row_ptr = unsafe { row_ptr.add(stride) };
 	}
 }
 /// Rasterizes a triangle over `bounds` when `bounds.width` is not a multiple of four.
 ///
 /// Each row is split in two: the leading part whose length is the largest multiple of four not
 /// exceeding the width is processed four pixels at a time, and the 1-3 trailing pixels are processed
 /// one at a time. `pixel_fn` is invoked once for every covered pixel with that pixel's three edge
 /// function values (`edge_bc`, `edge_ca`, `edge_ab` order).
 fn triangle_2d_4x_width_and_remainder<PixelType: Pixel>(
 	dest: &mut Bitmap<PixelType>,
 	edge_bc: TriangleEdge,
 	edge_ca: TriangleEdge,
 	edge_ab: TriangleEdge,
 	bounds: Rect,
 	pixel_fn: impl Fn(&mut PixelType, f32, f32, f32),
 ) {
 	let draw_width = bounds.width as usize;
 	let next_row_inc = dest.width() as usize;
 	// SAFETY: presumably requires `bounds` to lie entirely within `dest` -- TODO confirm against callers
 	let mut pixels = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };

 	// first x position that is not covered by a full group of four pixels
 	let x_remainder_start = draw_width - (draw_width & 3);

 	let mut w0_row = edge_bc.origin_simd();
 	let mut w1_row = edge_ca.origin_simd();
 	let mut w2_row = edge_ab.origin_simd();

 	for _ in bounds.y..=bounds.bottom() {
 		let mut w0 = w0_row;
 		let mut w1 = w1_row;
 		let mut w2 = w2_row;

 		// SAFETY: same assumption as above; `draw_width` pixels starting at `pixels` belong to this row
 		let row_pixels = unsafe { std::slice::from_raw_parts_mut(pixels, draw_width) };
 		let (simd_pixels, remainder_pixels) = row_pixels.split_at_mut(x_remainder_start);

 		// only whole groups of four are handled here. previously this loop ran over the full row width,
 		// so its last iteration indexed up to three pixels past the end of the row slice.
 		for group in simd_pixels.chunks_exact_mut(4) {
 			let mask = edge_bc.evaluate_simd(w0) & edge_ca.evaluate_simd(w1) & edge_ab.evaluate_simd(w2);
 			if mask.any() {
 				for (lane, pixel) in group.iter_mut().enumerate() {
 					if mask.test(lane) {
 						pixel_fn(pixel, w0[lane], w1[lane], w2[lane]);
 					}
 				}
 			}

 			w0 = edge_bc.step_x_simd(w0);
 			w1 = edge_ca.step_x_simd(w1);
 			w2 = edge_ab.step_x_simd(w2);
 		}

 		// after the last simd step, lane 0 holds the value for `x_remainder_start`, which is exactly
 		// where the scalar loop has to pick up (lane 3 would be three pixels too far to the right)
 		let mut w0 = w0[0];
 		let mut w1 = w1[0];
 		let mut w2 = w2[0];

 		for pixel in remainder_pixels.iter_mut() {
 			if edge_bc.evaluate(w0) && edge_ca.evaluate(w1) && edge_ab.evaluate(w2) {
 				pixel_fn(pixel, w0, w1, w2)
 			}

 			w0 = edge_bc.step_x(w0);
 			w1 = edge_ca.step_x(w1);
 			w2 = edge_ab.step_x(w2);
 		}

 		w0_row = edge_bc.step_y_simd(w0_row);
 		w1_row = edge_ca.step_y_simd(w1_row);
 		w2_row = edge_ab.step_y_simd(w2_row);
 		pixels = unsafe { pixels.add(next_row_inc) };
 	}
 }
 /// Rasterizes a triangle over `bounds` one pixel at a time, without any simd.
 ///
 /// Works for any `bounds.width`. `pixel_fn` is invoked once for every covered pixel with that
 /// pixel's three edge function values (`edge_bc`, `edge_ca`, `edge_ab` order).
 fn triangle_2d_any_width<PixelType: Pixel>(
 	dest: &mut Bitmap<PixelType>,
 	edge_bc: TriangleEdge,
 	edge_ca: TriangleEdge,
 	edge_ab: TriangleEdge,
 	bounds: Rect,
 	pixel_fn: impl Fn(&mut PixelType, f32, f32, f32),
 ) {
 	let row_len = bounds.width as usize;
 	let stride = dest.width() as usize;

 	// SAFETY: presumably requires `bounds` to lie entirely within `dest` -- TODO confirm against callers
 	let mut row_ptr = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };

 	// edge function values at the first pixel of the current row
 	let (mut row_w0, mut row_w1, mut row_w2) = (edge_bc.origin(), edge_ca.origin(), edge_ab.origin());

 	for _ in bounds.y..=bounds.bottom() {
 		let (mut w0, mut w1, mut w2) = (row_w0, row_w1, row_w2);

 		// SAFETY: same assumption as above; `row_len` pixels starting at `row_ptr` belong to this row
 		let row = unsafe { std::slice::from_raw_parts_mut(row_ptr, row_len) };

 		for pixel in row {
 			let covered = edge_bc.evaluate(w0) && edge_ca.evaluate(w1) && edge_ab.evaluate(w2);
 			if covered {
 				pixel_fn(pixel, w0, w1, w2);
 			}

 			w0 = edge_bc.step_x(w0);
 			w1 = edge_ca.step_x(w1);
 			w2 = edge_ab.step_x(w2);
 		}

 		row_w0 = edge_bc.step_y(row_w0);
 		row_w1 = edge_ca.step_y(row_w1);
 		row_w2 = edge_ab.step_y(row_w2);
 		row_ptr = unsafe { row_ptr.add(stride) };
 	}
 }
 #[inline]
@ -103,33 +308,11 @@ pub fn per_pixel_triangle_2d<PixelType: Pixel>(
 	let edge_ca = TriangleEdge::from(c, a, p);
 	let edge_ab = TriangleEdge::from(a, b, p);
-	let mut w0_row = edge_bc.origin();
+	if bounds.width % 4 == 0 {
-	let mut w1_row = edge_ca.origin();
+		triangle_2d_4x_width(dest, edge_bc, edge_ca, edge_ab, bounds, pixel_fn);
-	let mut w2_row = edge_ab.origin();
+	} else if bounds.width > 4 {
-
+		triangle_2d_4x_width_and_remainder(dest, edge_bc, edge_ca, edge_ab, bounds, pixel_fn);
-	let draw_width = bounds.width as usize;
+	} else {
-	let next_row_inc = dest.width() as usize;
+		triangle_2d_any_width(dest, edge_bc, edge_ca, edge_ab, bounds, pixel_fn);
 	let mut pixels = unsafe { dest.pixels_at_mut_ptr_unchecked(bounds.x, bounds.y) };
 	for _ in bounds.y..=bounds.bottom() {
 		let mut w0 = w0_row;
 		let mut w1 = w1_row;
 		let mut w2 = w2_row;
 		let row_pixels = unsafe { std::slice::from_raw_parts_mut(pixels, draw_width) };
 		for pixel in row_pixels.iter_mut() {
 			if edge_bc.evaluate(w0) && edge_ca.evaluate(w1) && edge_ab.evaluate(w2) {
 				pixel_fn(pixel, w0, w1, w2)
 			}
 			w0 = edge_bc.step_x(w0);
 			w1 = edge_ca.step_x(w1);
 			w2 = edge_ab.step_x(w2);
 		}
 		w0_row = edge_bc.step_y(w0_row);
 		w1_row = edge_ca.step_y(w1_row);
 		w2_row = edge_ab.step_y(w2_row);
 		pixels = unsafe { pixels.add(next_row_inc) };
 	}
 }