#include <simdmath/sqrtf4.h>
#include <simdmath/fmaxf4.h>
#include <simdmath/divf4.h>
#include <simdmath/divf4_fast.h>
#include <simdmath/rsqrtf4.h>
#include <simdmath/recipf4.h>
#include <simdmath/recipf4_fast.h>
#include <simdmath/logf4.h>

#include <spu_intrinsics.h>
#else /* not __SPU__ */

static vector float zero  = {0, 0, 0, 0};
static vector float two   = {2, 2, 2, 2};
static vector float four  = {4, 4, 4, 4};
static vector float eight = {8, 8, 8, 8};

// the fast versions do not bring significant performance benefits
// fmaxf between NaN and zero is zero
#define spu_max _fmaxf4

// divide by zero: NaN, NaN; in fast mode: -0, -0
#define spu_div _divf4
//#define spu_div _divf4_fast

// sqrt of a negative number: 0; in fast mode: undefined; sqrt of NaN: a large number(?)
#define spu_sqrt(a) _sqrtf4(spu_max(a, zero))
//#define spu_sqrt(a) sqrtf4_fast(spu_max(a, zero))

#define spu_recip _recipf4
//#define spu_recip _recipf4_fast

#define spu_rsqrt _rsqrtf4

#define spu_log _logf4
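
// Illustrative note (not used by the solver): the clamp inside spu_sqrt
// relies on the _fmaxf4 behaviour described above -- both negative lanes and
// NaN lanes are mapped to zero before the square root, so spu_sqrt never
// sees an out-of-domain input. For example:
//
//   vector float bad  = {-1.0f, 4.0f, 0.0f, 9.0f};
//   vector float safe = spu_sqrt(bad);   // -> approximately {0, 2, 0, 3}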

#define VECTOR_PRINT(var, val) \
    printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));

#define iVECTOR_PRINT(var, val) \
    printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));

#define VECTOR_PRINT_MATRIX(num, fmt, MATRIX, width, n, m) \
    for (i = 0; i < n; i++) { \
        for (j = 0; j < m; j++) \
            printf(fmt, spu_extract(MATRIX[i * width + j], num)); \

inline static vector unsigned int vec_potrfU_4(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda, *pA2 = pA1 + lda, *pA3 = pA2 + lda;
    vector float L11 = *A, L21 = *pA1, L31 = *pA2, L41 = *pA3;
    vector float L22 = pA1[1], L32 = pA2[1], L42 = pA3[1];
    vector float L33 = pA2[2], L43 = pA3[2];
    vector float L44 = pA3[3];

    vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L21 = spu_mul(L21, L11);
    L31 = spu_mul(L31, L11);
    L41 = spu_mul(L41, L11);

    L22 = spu_nmsub(L21, L21, L22); // L22 -= L21*L21;

    errors = spu_and(spu_cmpgt(L22, zero), errors);

    pA1[1] = spu_recip(L22);

    L32 = spu_mul(L22, spu_nmsub(L31, L21, L32)); // (L32 - L31*L21) * L22;
    L42 = spu_mul(L22, spu_nmsub(L41, L21, L42)); // (L42 - L41*L21) * L22;
    L33 = spu_nmsub(L32, L32, spu_nmsub(L31, L31, L33)); // (L33 - L31*L31) - L32*L32;

    errors = spu_and(spu_cmpgt(L33, zero), errors);

    L33 = spu_rsqrt(L33);
    pA2[2] = spu_recip(L33);

    L43 = spu_mul(L33, spu_nmsub(L42, L32, spu_nmsub(L41, L31, L43))); // ((L43 - L41*L31) - L42*L32) * L33;
    L44 = spu_nmsub(L43, L43, spu_nmsub(L42, L42, spu_nmsub(L41, L41, L44))); // (((L44 - L41*L41) - L42*L42) - L43*L43);

    errors = spu_and(spu_cmpgt(L44, zero), errors);
    pA3[3] = spu_sqrt(L44);
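
/*
 * For reference, a scalar sketch of the recurrence the unrolled routine above
 * follows: each SIMD lane factors an independent 4x4 matrix, and the "errors"
 * mask records, per lane, whether every pivot stayed positive. Illustration
 * only (assumes <math.h>); it is not part of the SPU code path.
 */
#if 0
static int cholesky4_scalar(float a[4][4]) /* lower triangle, row-major */
{
    int i, j, k;
    for (j = 0; j < 4; j++) {
        float d = a[j][j];
        for (k = 0; k < j; k++)
            d -= a[j][k] * a[j][k];        /* pivot: A[j][j] - sum_k L[j][k]^2 */
        if (d <= 0.0f)
            return 0;                      /* not positive definite */
        a[j][j] = sqrtf(d);
        for (i = j + 1; i < 4; i++) {
            float s = a[i][j];
            for (k = 0; k < j; k++)
                s -= a[i][k] * a[j][k];
            a[i][j] = s / a[j][j];         /* L[i][j] = (A[i][j] - sum) / L[j][j] */
        }
    }
    return 1;
}
#endif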

inline static vector unsigned int vec_potrfU_3(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda, *pA2 = pA1 + lda;
    register vector float L11 = *A, L21 = *pA1, L31 = *pA2;
    register vector float L22 = pA1[1], L32 = pA2[1];
    register vector float L33 = pA2[2];

    vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);

    L21 = spu_mul(L21, L11);
    L31 = spu_mul(L31, L11);

    L22 = spu_nmsub(L21, L21, L22);

    errors = spu_and(spu_cmpgt(L22, zero), errors);

    L22 = spu_rsqrt(L22);
    L32 = spu_mul(spu_nmsub(L31, L21, L32), L22); // (L32 - L31*L21) / sqrt(L22);

    L33 = spu_nmsub(L32, L32, spu_nmsub(L31, L31, L33)); // (L33 - L31*L31) - L32*L32;

    errors = spu_and(spu_cmpgt(L33, zero), errors);

    pA1[1] = spu_recip(L22);

    pA2[2] = spu_sqrt(L33);

inline static vector unsigned int vec_potrfU_2(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda;
    register vector float L11 = *A, L21 = *pA1, L22 = pA1[1];

    register vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);

    *pA1 = L21 = spu_mul(L21, L11); // multiply by rsqrt(L11); a zero pivot behaves like division by zero

    L22 = spu_nmsub(L21, L21, L22); // L22 -= L21*L21;

    errors = spu_and(spu_cmpgt(L22, zero), errors);
    pA1[1] = spu_sqrt(L22);

inline static void vec_strsv_rlnn (short int N, const vector float *A, const int lda, vector float *X) {
    register short int i, j;

    for (i = 0; i < N; i++) {
        vector float tmp = X[i];

        for (j = 0; j < i; j++) {
            tmp = spu_nmsub(A[lda * i + j], X[j], tmp); // tmp -= A[lda * i + j] * X[j];
        }
        X[i] = spu_div(tmp, A[(lda + 1) * i]); // X[i] = tmp / A[lda * i + i];
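
/*
 * Scalar sketch of the same forward substitution: solve L*y = x in place,
 * with L lower triangular and its diagonal at A[i*lda + i]. The SIMD version
 * above does this element-wise, i.e. four independent systems at once, one
 * per lane. Illustration only, not part of the SPU code path.
 */
#if 0
static void strsv_rlnn_scalar(short int N, const float *A, int lda, float *X)
{
    short int i, j;
    for (i = 0; i < N; i++) {
        float tmp = X[i];
        for (j = 0; j < i; j++)
            tmp -= A[lda * i + j] * X[j];  /* subtract already-solved terms */
        X[i] = tmp / A[(lda + 1) * i];     /* divide by the diagonal */
    }
}
#endif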

#include <sum_across_float4.h>
#include "vec_potrf_mtxmul.h"

#define HSUM(r1,r2,r3,r4) \
    tmp1 = spu_rlqwbyte(r1, 8); \
    tmp2 = spu_rlqwbyte(r3, 8); \
    tmp3 = spu_sel(r1, r3, mask1); \
    tmp4 = spu_sel(tmp1, tmp2, mask1); \
    tmp5 = spu_add(tmp3, tmp4); \
    tmp1 = spu_rlqwbyte(r2, 8); \
    tmp2 = spu_rlqwbyte(r4, 8); \
    tmp3 = spu_sel(r2, r4, mask1); \
    tmp4 = spu_sel(tmp1, tmp2, mask1); \
    tmp6 = spu_add(tmp3, tmp4); \
    tmp1 = spu_rlqwbyte(tmp5, 4); \
    tmp2 = spu_rlqwbyte(tmp6, -4); \
    tmp3 = spu_sel(tmp5, tmp6, mask2); \
    tmp4 = spu_sel(tmp1, tmp2, mask2); \
    tmp5 = spu_add(tmp3, tmp4);
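
/*
 * HSUM reduces four accumulator vectors to one: lane k of the result (tmp5)
 * ends up holding the horizontal sum of r<k+1>, computed with rotates and
 * selects only (no scalar extraction). A scalar sketch of the intent,
 * illustration only:
 */
#if 0
static void hsum_scalar(const float r[4][4], float out[4])
{
    int k;
    for (k = 0; k < 4; k++)
        out[k] = r[k][0] + r[k][1] + r[k][2] + r[k][3];
}
#endif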

#define DO_MATRIX(size) \
    for (k = 0; k < K; k += BLOCK_X) { \
        const vector float *B = A + k; \

void vec_ssyrk_rln_11 (short int N, short int K, const vector float *A, short int lda, float *C, short int ldc)
{
    register short int k;

    register vector float tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    register vector unsigned int mask1 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
    register vector unsigned int mask2 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};

    // assert((N % 16) == 0);
    // ldc is walloc; N = ralloc may be smaller as a memory optimization, but walloc must be used here so the size stays a multiple of 16
    memset(C, 0, N * ldc * sizeof(float));
    ssyrk_spu((float*)A, C, 1, /*N*/ldc, K * 4, lda * 4, ldc);
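
    /*
     * The intended result, spelled out by the reference loop below, is the
     * lower-triangle rank-K update
     *     C[i*ldc + j] = sum_k A[i*lda + k] . A[j*lda + k],   0 <= j <= i < N,
     * where each "." is an element-wise product of 4-float vectors whose
     * lanes still have to be summed into the scalar C entry.
     */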

    for (i = 0; i < N; i++) {
        for (j = 0; j <= i; j++) {
            register vector float temp = zero;
            for (k = 0; k < K; k++) {
                temp = spu_madd(A[i * lda + k], A[j * lda + k], temp);
            }
            C[i * ldc + j] = spu_extract(temp, 0) + spu_extract(temp, 1) + spu_extract(temp, 2) + spu_extract(temp, 3);

#define DO_ROW(size) \
    for (k = 0; k < K; k += BLOCK_X) { \
        const vector float *B = A + row * lda + k; \
        COMPUTE_V##size((A + l * lda + k)) \
    SUM_V##size(((vector float*)(C+l)));

void vec_update_row (short int row, short int N, short int K, const vector float *A, short int lda, float *C) {

    register vector float tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    register vector unsigned int mask1 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
    register vector unsigned int mask2 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
    // vector float temp;

    for (l = 0; l < N; l += 16) {
        short int rem = min(16, N - l);

        for (k = 0; k < K; k += BLOCK_X) {
            const vector float *B = A + row * lda + k;

            COMPUTE_V5((A + l * lda + k))

        SUM_V5(((vector float*)(C+l)));

    vector float temp[N];
    memset(temp, 0, sizeof(vector float) * N);
    for (k = 0; k < K; k++) {

        register vector float Arow = A[row * lda + k];
        for (l = 0; l < N; l++) {
            temp[l] = spu_madd(Arow, A[l * lda + k], temp[l]);
            C[l] = spu_extract(temp[l], 0) + spu_extract(temp[l], 1) + spu_extract(temp[l], 2) + spu_extract(temp[l], 3);

        printf("Step: %i\n", k);
        PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);

    for (l = 0; l < N; l++) {
        C[l] = spu_extract(temp[l], 0) + spu_extract(temp[l], 1) + spu_extract(temp[l], 2) + spu_extract(temp[l], 3);

    PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);

    for (l = 0; l < N; l++) {

        register vector float tmp = zero;
        for (k = 0; k < K; k++) {
            tmp = spu_madd(A[i * lda + k], A[j * lda + k], tmp);

        C[l] = spu_extract(tmp, 0) + spu_extract(tmp, 1) + spu_extract(tmp, 2) + spu_extract(tmp, 3);

    PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);
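
/*
 * What vec_update_row computes, as a scalar sketch: C[l] is the dot product
 * of row "row" of A with row l of A, taken over all 4*K floats (the K vector
 * elements multiplied element-wise and then summed across the four lanes).
 * Illustration only; lda4 below is a hypothetical stride in floats (4 * lda
 * of the vector version).
 */
#if 0
static void update_row_scalar(short int row, short int N, short int K,
                              const float *A, short int lda4, float *C)
{
    short int l, k;
    for (l = 0; l < N; l++) {
        float acc = 0.0f;
        for (k = 0; k < 4 * K; k++)
            acc += A[row * lda4 + k] * A[l * lda4 + k];
        C[l] = acc;
    }
}
#endif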

inline static void vec_spotrf_step(const short int M, const short int N, vector float *A, const short int lda) {
    register short int i, j, k;

    vector float *B = A + M * lda;
    vector float *C = B + M;

    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            vector float sum = {0, 0, 0, 0};

            for (k = 0; k < j; k++) {
                sum = spu_madd(A[j * lda + k], B[i * lda + k], sum);
            }
            B[i * lda + j] = spu_div(spu_sub(B[i * lda + j], sum), A[j * lda + j]);
        }

        for (j = 0; j <= i; j++) {
            vector float temp = {0, 0, 0, 0};

            for (k = 0; k < M; k++) {
                temp = spu_madd(B[i * lda + k], B[j * lda + k], temp);
            }
            C[i * lda + j] = spu_sub(C[i * lda + j], temp);
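
    /*
     * This is the standard blocked trailing update: with the leading M x M
     * block already factored (its Cholesky factor L stored in A), each row
     * of the panel B is solved by forward substitution,
     *     B[i][j] = (B[i][j] - sum_{k<j} A[j][k]*B[i][k]) / A[j][j],
     * i.e. B <- B * L^{-T}, and the trailing block is then updated as
     *     C[i][j] = C[i][j] - sum_{k<M} B[i][k]*B[j][k],   j <= i,
     * i.e. C <- C - B*B^T (lower triangle only). Each SIMD lane carries an
     * independent matrix.
     */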

// errors are ignored if they would lead to a 0 determinant
vector unsigned int vec_spotrf_u(short int N, vector float *A, short int lda) {
    short int Nleft, Nright;

    vector unsigned int errors;

    //PRINT_MULTIMATRIX(0, "% 7.4f ", ((float*)A), 5, 5, 5);

        // printf("passed matrix\n");
        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

        errors = vec_spotrf_u(Nleft, A, lda);

        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

        vec_spotrf_step(Nleft, Nright, A, lda);

        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

        errors = spu_and(vec_spotrf_u(Nright, A + (lda + 1) * Nleft, lda), errors);
        // printf("right\n");
        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

    else if (N == 4) errors = vec_potrfU_4(A, lda);
    else if (N == 3) errors = vec_potrfU_3(A, lda);
    else if (N == 2) errors = vec_potrfU_2(A, lda);

        errors = spu_cmpgt(*A, zero);

    } else return spu_cmpgt(zero, zero); // anything
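
/*
 * Shape of the recursion above, sketched (illustration only; the exact split
 * of N into Nleft/Nright is not shown here): factor the leading Nleft x Nleft
 * block, apply vec_spotrf_step to the panel and the trailing block, then
 * recurse on the Nright x Nright block at A + (lda + 1) * Nleft. Small N is
 * handled by the unrolled vec_potrfU_* leaves, and the per-lane error masks
 * of both halves are AND-ed together.
 */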

vector float vec_rcorr(short int N, vector float *C, vector float *Ca, vector float *Cb) {
    register short int i;
    short int step = N + 1;
    short int end = N * N;
    vector float detC, detAB;

    // VECTOR_PRINT_MATRIX(1, "%6.4f", C, 5, 5, 5);

    detAB = spu_mul(Ca[0], Cb[0]);
    detC = C[0];
    for (i = step; i < end; i += step) {
        detAB = spu_mul(detAB, spu_mul(Ca[i], Cb[i]));
        detC = spu_mul(detC, C[i]);
    }

    VECTOR_PRINT("detAB", detAB);
    VECTOR_PRINT("detC", detC);
    VECTOR_PRINT("detC^2", spu_mul(detC, detC));
    VECTOR_PRINT("C/AB", spu_div(spu_mul(detC, detC), detAB));
    VECTOR_PRINT("log", spu_log(spu_div(spu_mul(detC, detC), detAB)));

    return spu_mul(two, spu_log(spu_div(spu_mul(detC, detC), detAB)));
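
/*
 * Note on the determinants above (assuming C, Ca and Cb hold Cholesky
 * factors, with the factor diagonals at indices 0, step, 2*step, ...): the
 * determinant of a factored matrix is the squared product of its factor's
 * diagonal, so detC*detC is the determinant of the matrix C factors, while
 * detAB is the square root of the product of the determinants of the
 * matrices Ca and Cb factor; the value returned is therefore
 *     2 * log( detC^2 / detAB ) per lane.
 */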

vector float vec_rmahal(short int N, vector float *C, vector float *X) {
    register short int i;
    vector float sum = zero;

    vec_strsv_rlnn(N, C, N, X);
    for (i = 0; i < N; i++) {
        vector float val = X[i];
        sum = spu_madd(val, val, sum);
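
/*
 * With C holding a Cholesky factor L of a covariance matrix (see vec_rcorr),
 * the loop above evaluates the Mahalanobis-style quadratic form
 *     x^T * Sigma^{-1} * x = | L^{-1} x |^2,
 * using the forward solve y = L^{-1} x from vec_strsv_rlnn and then summing
 * y[i]^2. Each SIMD lane carries an independent (Sigma, x) pair.
 */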

vector float vec_bhata(vector float rcorr, vector float rmahal) {
    return spu_add(spu_div(rmahal, eight), spu_div(rcorr, four));
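
/*
 * For reference, the Bhattacharyya distance between two Gaussians is
 *     D_B = (1/8) * (mu1 - mu2)^T Sigma^{-1} (mu1 - mu2)
 *         + (1/2) * ln( det(Sigma) / sqrt(det(Sigma1) * det(Sigma2)) ),
 * with Sigma the combined covariance. Assuming rmahal and rcorr come from
 * vec_rmahal and vec_rcorr above, rmahal/8 supplies the first term and
 * rcorr/4 = (1/2) * log(detC^2 / detAB) supplies the second.
 */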