#Multi-threaded cloth simulation is slower than single-threaded

20 messages · Page 1 of 1 (latest)

late falcon
#

I've created a cloth simulation using glium to render the cloth and I'm running into an issue where my single-threaded solution is faster than the multi-threaded threaded version. Here is the single threaded method:

        let mut forces = vec![vec![(0.0, 0.0); self.points[0].len()]; self.points.len()];

        for (i, row) in self.points.iter().enumerate() {
            for (j, point) in row.iter().enumerate() {
                let mut total_force_x = 0.0;
                let mut total_force_y = 0.0;

                //spring and damper
                for spring in &self.springs {
                    let point1 = &self.points[spring.p1.0][spring.p1.1];
                    let point2 = &self.points[spring.p2.0][spring.p2.1];

                    let dx = point2.x - point1.x;
                    let dy = point2.y - point1.y;

                    let distance = (dx * dx + dy * dy).sqrt();
                    let magnitude = spring.spring_coeff * (distance - spring.rest_length);

                    let spring_force_x = (magnitude * dx) / distance;
                    let spring_force_y = (magnitude * dy) / distance;

                    let damping_force_x = -point.vx * spring.damp_coeff;
                    let damping_force_y = -point.vy * spring.damp_coeff;

                    if point1.x == point.x && point1.y == point.y {
                        total_force_x += spring_force_x + damping_force_x;
                        total_force_y += spring_force_y + damping_force_y;
                    } else if point2.x == point.x && point2.y == point.y {
                        total_force_x -= spring_force_x - damping_force_x;
                        total_force_y -= spring_force_y - damping_force_y;
                    }
                }```
#
                let gravity_force_x = 0.0;
                let gravity_force_y = if self.g_on { -self.g * self.m } else { 0.0 };

                //external forces
                let mut rng = rand::thread_rng();
                let ext_force_x = rng.gen_range(-1.0..1.0) * point.ext_m;
                let ext_force_y = rng.gen_range(-1.0..1.0) * point.ext_m;

                //total
                total_force_x += gravity_force_x + ext_force_x;
                total_force_y += gravity_force_y + ext_force_y;

                forces[i][j] = (total_force_x, total_force_y);
            }
        }

for (i, row) in self.points.iter_mut().enumerate() {
            for (j, point) in row.iter_mut().enumerate() {
                if point.fixed {
                    continue;
                }

                let (fx, fy) = forces[i][j];

                //accelaration
                point.ax = fx / self.m;
                point.ay = fy / self.m;

                let prev_x = point.x;
                let prev_y = point.y;
                point.x += point.vx * dt + 0.5 * point.ax * dt * dt;
                point.y += point.vy * dt + 0.5 * point.ay * dt * dt;

                //floor colision
                if point.y < -32.0 {
                    point.y = -32.0;
                    point.vy = 0.0;
                }

                //velocity
                let new_vx = (point.x - prev_x) / dt;
                let new_vy = (point.y - prev_y) / dt;
                point.vx = if point.y == -32.0 { -new_vy } else { new_vx };
                point.vy = if point.y == -32.0 { -new_vy } else { new_vy };
            }
        }
    }```
#

In order to simulate the cloth with multiple threads i use the following function:

pub fn simulate_multithreaded(&mut self, dt: f32) {
        let num_threads = 2 * (*CORE_COUNT as usize);
        // let num_threads = 2;
        let rows_per_thread = self.points.len() / num_threads;

        let mut pool = scoped_threadpool::Pool::new(num_threads as u32);

        let points_arc = Arc::new(Mutex::new(self.points.clone()));

        let m = self.m;
        let g = self.g;
        let g_on = self.g_on;

        pool.scoped(|scope| {
            for i in 0..num_threads {
                let start_row = i * rows_per_thread;
                let end_row = if i == num_threads - 1 {
                    self.points.len()
                } else {
                    (i + 1) * rows_per_thread
                };

                let points_arc = Arc::clone(&points_arc);
                let springs = &self.springs;

                scope.execute(move || {
                    let mut points = points_arc.lock().unwrap();
                    simulate_segment(&mut points, springs, start_row, end_row, dt, m, g, g_on);
                });
            }
        });

        let points = Arc::try_unwrap(points_arc).unwrap().into_inner().unwrap();
        self.points = points;
    }
#

The function to simulate each segment is like so:

fn simulate_segment(
    points: &mut Vec<Vec<Point>>,
    springs: &Vec<Spring>,
    start_row: usize,
    end_row: usize,
    dt: f32,
    m: f32,
    g: f32,
    g_on: bool
) {
    // Calculate forces for the assigned cloth section
    let mut forces = vec![vec![(0.0,0.0); points[0].len()]; end_row - start_row];

    for (i, row) in points[start_row..end_row].iter().enumerate() {
        for (j, point) in row.iter().enumerate() {
            let mut total_force_x = 0.0;
            let mut total_force_y = 0.0;

The rest of the function is the same as the single threaded version but it uses the start to end row instead of going through the entire grid

#

Also here is how i've set up my structs:

#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Point {
    pub x: f32,
    pub y: f32,
    pub vx: f32,
    pub vy: f32,
    pub ax: f32,
    pub ay: f32,
    pub fixed: bool,
    pub ext_m: f32,
}

#[derive(Clone, Copy)]
pub struct Spring {
    pub p1: (usize, usize),
    pub p2: (usize, usize),
    pub rest_length: f32,
    pub spring_coeff: f32,
    pub damp_coeff: f32,
}

#[derive(Clone)]
pub struct Cloth {
    pub points: Vec<Vec<Point>>,
    pub springs: Vec<Spring>,
    pub g: f32,
    pub m: f32,
    pub g_on: bool,
}
coarse swan
#

the most basic problem here is

scope.execute(move || {
    let mut points = points_arc.lock().unwrap();

by taking a single lock, which you hold while doing the computation, you're preventing all parallelism

#

in order to get parallelism you have to have some work you can do while not holding the lock

#

For physics simulations, a good choice is to have two copies of the system state — one "past" which is read, and one "future" which is written. That way, the threads can read the "past" state without locking, and write to different parts of the "future" state, again without locking (there are various ways to do that, but rayon's parallel iterators are often a good choice).

#

(Also, there is always some overhead to concurrency, so you need to make sure that the work you're doing in each task that you distribute to other threads is big enough to be worth it. That's probably not a problem for you judging by how simulate_segment starts, but if you were doing something like just position += velocity, you'd want to do more than one of those per task.)

late falcon
#

oh okay i understand thank you. would you say it is necessary to have 2 copies? or would it just make my life easier / work better?

gray gorge
#

For this you don't have to have 2 data sets since you calculate the force in 1 pass then the position update in the next. I'd do this by having each point know what springs it is attached, then it sums up all the forces on itself in parallel then another parallel pass to update the positions. You can easily partition it as long as the points are calculating their own properties, if you partition on springs you could have an issue with concurrent writing your points across partition boundaries since your springs refer to your points.

For multi-threading think of it as you NEVER want to write across your partition boundary and if you don't you are safe basically 100% of the time.

late falcon
# coarse swan For physics simulations, a good choice is to have two copies of the system state...

im trying to implement the 2 states as well as computing forces and updating of points in separate functions, but every time i try I get borrow check errors and value move errors without using Arc and Mutex.

pub fn simulate_multithreaded(&mut self, dt: f32) {
        let num_threads = 1 * (*CORE_COUNT as u32);
        let rows_per_thread = self.points.len() / (num_threads as usize);
        let mut pool = Pool::new(num_threads);

        let read_cloth = self.clone();
        let mut write_cloth = self.clone();

        let m = self.m;
        let g = self.g;
        let g_on = self.g_on;

        pool.scoped(|scope| {
            for i in 0..num_threads {
                let start_row = (i as usize) * rows_per_thread;
                let end_row = if (i as usize) == (num_threads as usize) - 1 {
                    self.points.len()
                } else {
                    start_row + rows_per_thread
                };

                scope.execute(move || {
                    let forces = compute_forces(
                        &read_cloth.points,
                        &read_cloth.springs,
                        start_row,
                        end_row,
                        m,
                        g,
                        g_on
                    );
                    update_points(&mut write_cloth.points, &forces, start_row, end_row, dt, m);
                });
            }
        });

        self.points = write_cloth.points;
    }
}
coarse swan
#

you cannot do it with just giving each task the same &mut write_cloth.points

#

you have to split the borrow, so the compiler can recognize that what is written to will be disjoint

#

but I strongly recommend you check out rayon; its parallel iterators make it painless to set up splitting like this

late falcon
#

okay thank you so much for your help i will try that

late falcon
coarse swan
#

for this purpose it doesn't matter how you represent a 2D grid — you will have the same problems or not, either way

#

however, you should definitely avoid Vec<Vec<T>> regardless — it is inefficient because it makes many separate allocations (reducing memory locality) and requires following an extra pointer