/ani/mrses : revision 1

To get this branch, use:

bzr branch
http://suren.me/webbzr/ani/mrses

« back to all changes in this revision

Viewing changes to cell/mrses_spe.txt

Committer: Suren A. Chilingaryan
Date: 2010-04-28 04:30:08 UTC
Revision ID: csa@dside.dyndns.org-20100428043008-vd9z0nso9axezvlp

Initial import

files added:

README

VERSION

bmc.m

cell

cell/Makefile

cell/Makefile.in

cell/README

cell/atlas_potrf.c

cell/atlas_potrf.h

cell/buildutils

cell/buildutils/README_build_env.txt

cell/buildutils/cellsdk_select_compiler

cell/buildutils/make.env

cell/buildutils/make.footer

cell/buildutils/make.header

cell/ext

cell/ext/blas.h

cell/ext/cblas.h

cell/ext/lapack.h

cell/ext/lapack_errno.h

cell/hw_sched.c

cell/hw_sched.h

cell/hw_thread.c

cell/hw_thread.h

cell/mrses.h

cell/mrses_hw.c

cell/mrses_impl.c

cell/mrses_impl.h

cell/mrses_ppu.c

cell/mrses_ppu.h

cell/mrses_spe.c

cell/mrses_spe.h

cell/mrses_spe.txt

cell/mrses_spu.c

cell/mrses_spu.h

cell/msg.c

cell/msg.h

cell/ppu

cell/ppu/Makefile

cell/spu

cell/spu/Makefile

cell/test

cell/test.c

cell/test/Makefile

cell/tools.c

cell/tools.h

cell/vec_potrf.c

cell/vec_potrf.h

cell/vec_potrf.pl

cell/vec_potrf_mtxmul.h

libs

libs/ppu

libs/ppu/blas_LINUX.a

libs/ppu/lapack_LINUX.a

libs/ppu/libgslblas.a

libs/ppu/libgslcblas.a

libs/spu

libs/spu.txt

libs/spu/blas_SPE.a

libs/spu/lapack_SPE.a

libs/spu/libgslblas.a

libs/spu/libgslcblas.a

mrses.m

mrses_hw_debug.m

mrses_hw_distance.m

mrses_mtx.m

mrses_orig.m

mrses_software.m

release.sh

scripts

scripts/mrses_install.sh

test.m

test.sh

Show diffs side-by-side

added added

removed removed

cell/mrses_spe.txt

This file contains some ideas which was slower or not comed to the end due

their nature or my inabilities or for time saving purposes.

1. The general idea is to computer full 16x16 matrix until the end. We got

several quadratic matrices along the diagonal with which are the actual

C matrixes and we padding the rest of diagonal with 1. Everything else is 0.

Then we can compute Cholesky and solve equations on all matrices together

using SPE-optimized blas. However, this approach is twice slower compared

to computing matrix by matrix using non-optimized blas.

for (j = 1; j < at_once; j++) {

int offset = j * width * real_width;

MRSESDataType *C_cur = C + offset;

MRSESDataType *Ca_cur = Ca + offset;

MRSESDataType *Cb_cur = Cb + offset;

for (i = 0; i < real_width; i++) {

memset(C_cur + i * width, 0, j * real_width * sizeof(MRSESDataType));

}

for (j = at_once * real_width; j < width; j++) {

memset(C + j * width, 0, width * sizeof(MRSESDataType));

C[j * width + j] = 1;

}

Solving equation can be done with following commands, but I have not managed to

get correct results. Anyway the trsv uses neglectable amount of time. Most

things are done in matrix multiplication and cholesky decomposition.

transpose_matrix(width, width, C, width, D, width);

MRSESDataType *D_cur = D + offset;

solve_upper_1(real_width, D_cur, width, mean_cur);

// multiple of 16

strsv_spu_lower(width, C, width, mean);

2. Another idea was to compute [Ca]*[Cb] as [Ca*Cb]. For that we could

multiply Ca*Cb and then multiply by transpose of it (Ca*Cb)' to get

symmetric matrix. Finally we should get the square of determinat.

But, the resulting symmetric matrix is not positive-definite (or for

precision reasons), the cholesky decomposition have failed for it.

for (i = 0; i < width; i++) {

for (j = 0; j <= i; j++) {

Ca[j * width + i] = Ca[i * width + j];

Cb[j * width + i] = Cb[i * width + j];

}

memset(Cab, 0, width*width*sizeof(MRSESDataType));

//sgemm_spu(width, width, width, Ca, Cb, Cab);

cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, width, width, width, 1, Ca, width, Cb, width, 0, Cab, width);

memset(Cab2, 0, width*width*sizeof(MRSESDataType));

blas_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);

//spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);

for ...

lapack_potrf(&hmode, &real_width, Cab_cur, &width, err);

detC = C_cur[0]; detAB = Cab_cur[0];

for (i = width + 1; i < c_step; i+= (width+1)) {

detAB *= Cab_cur[i];

detC *= C_cur[i];

}

rcorr = logf(detC * detC * detC * detC / detAB);

}

3. Following code is matrix multiplocation using default (non-SPE) blas. Even

if used with appropriate small matrices it is significantly slower than IBM

code on full matrix.

char hmode = 'L';

//original (do not store C)

//blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);

//blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);

blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 50, 1, A, nA, 0, Ca, width);

blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 40, 1, B, nB, 0, Cb, width);

memcpy(C, Ca, width2 * sizeof(MRSESDataType));

blas_axpy(width2, 1, Cb, 1, C, 1);

blas_scal(width2, 0.5, C, 1);

4. Other ways to solve equation system. Both works but a bit slower

// Faster 82:

blas_trsm(

CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit,

1, real_width, 1, C_cur, width, mean_cur, width //1?

);

// Slower 83-84

blas_trsm(

100

CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,

101

real_width, 1, 1, C_cur, width, mean_cur, 1 //width //1?

102

);

103

104

105

5. Other ways to perform cholesky decomposition. The lapack version is surely

106

working. Curretnly implemented seems to work, but who nows about special cases.

107

108

atlas_spotrf2(real_width, C_cur, width);

109

atlas_spotrf2(real_width, Ca_cur, width);

110

atlas_spotrf2(real_width, Cb_cur, width);

111

112

lapack_potrf(&hmode, &real_width, C_cur, &width, &err);

113

if (err) return 1;

114

lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);

115

if (err) return 1;

116

lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);

117

if (err) return 1;

118

119

120

6. This is fully working, but not vectorized version of code

121

static inline int mrses_spe_real_run(

122

MRSESContext mrses, MRSESDataType *result,

123

int width, int width2, int nA, int nB,

124

MRSESDataType *A, MRSESDataType *B, MRSESDataType *mean,

125

MRSESDataType *C, MRSESDataType *Ca, MRSESDataType *Cb

126

) {

127

int i, j, err;

128

129

MRSESDataType detAB, detC;

130

MRSESDataType rmahal, rcorr;

131

132

short int walloc;

133

134

char hmode = 'U';

135

136

int real_width = 5;

137

// int iterate_size = mrses->iterate_size;

138

int at_once = max(1, SPE_BLOCK / real_width);

139

// int times = iterate_size / at_once;

140

141

int rww = at_once * width * real_width;

142

int rww_alloc = calc_alloc(rww, 64);

143

144

memset(Ca, 0, rww * sizeof(MRSESDataType));

145

memset(Cb, 0, rww * sizeof(MRSESDataType));

146

147

//PRINT_MATRIX("% 6.4f ", A, 16, 16, 16)

148

149

// that works on multiples of 16

150

spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);

151

spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);

152

153

// thats works on multiples of 64 (16x16 - fine)

154

memcpy(C, Ca, rww * sizeof(MRSESDataType));

155

spe_axpy(rww_alloc, 1, Cb, 1, C, 1);

156

spe_scal(rww_alloc, 0.5, C, 1);

157

158

int c_step = real_width * (width + 1);

159

160

for (j = 0; j < at_once; j++) {

161

int offset = j * c_step;

162

MRSESDataType *C_cur = C + offset;

163

MRSESDataType *Ca_cur = Ca + offset;

164

MRSESDataType *Cb_cur = Cb + offset;

165

MRSESDataType *mean_cur = mean + j * real_width;

166

MRSESDataType result;

167

168

169

err = atlas_spotrf_u(real_width, C_cur, width);

170

if (err) return 1;

171

err = atlas_spotrf_u(real_width, Ca_cur, width);

172

if (err) return 1;

173

err = atlas_spotrf_u(real_width, Cb_cur, width);

174

if (err) return 1;

175

176

177

178

atlas_spotrf2(real_width, C_cur, width);

179

atlas_spotrf2(real_width, Ca_cur, width);

180

atlas_spotrf2(real_width, Cb_cur, width);

181

182

lapack_potrf(&hmode, &real_width, C_cur, &width, &err);

183

if (err) return 1;

184

lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);

185

if (err) return 1;

186

lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);

187

if (err) return 1;

188

189

190

detC = C_cur[0]; detAB = Ca_cur[0] * Cb_cur[0];

191

for (i = width + 1; i < c_step; i+= (width+1)) {

192

detAB *= (Ca_cur[i] * Cb_cur[i]);

193

detC *= C_cur[i];

194

}

195

196

rcorr = 2 * logf(detC * detC / detAB);

197

198

199

blas_trsv(

200

CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit,

201

real_width, C_cur, width, mean_cur, 1

202

);

203

204

rmahal = blas_dot(real_width, mean_cur, 1, mean_cur, 1);

205

206

switch (mrses->dist) {

207

case BHATTACHARYYA:

208

result = rmahal/8 + rcorr/4;

209

break;

210

case MAHALANOBIS:

211

result = rmahal;

212

break;

213

case CORCOR:

214

result = rcorr;

215

break;

216

default:

217

result = 0;

218

}

219

220

// printf("SPU result: %e (mahal: %e, corcor: %e)\n", result, rmahal, rcorr);

221

}

222

223

return 0;

224

}

225

226

data->A = (MRSESDataType*)(allocation + pos);

227

pos += walloc * alloc * sizeof(MRSESDataType);

228

229

data->B = (MRSESDataType*)(allocation + pos);

230

pos += walloc * alloc * sizeof(MRSESDataType);

231

232

data->C = (MRSESDataType*)(allocation + pos);

233

pos += walloc2 * sizeof(MRSESDataType);

234

235

data->Ca = (MRSESDataType*)(allocation + pos);

236

pos += walloc2 * sizeof(MRSESDataType);

237

238

data->Cb = (MRSESDataType*)(allocation + pos);

239

pos += walloc2 * sizeof(MRSESDataType);

240

241

// memset(data->A + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));

242

// memset(data->B + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));

243

// memset(data->C + used_walloc * walloc, 0, (walloc - used_walloc) * walloc * sizeof(MRSESDataType));

244

245

// memset(data->mean + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));

246

// memset(data->mean_copy + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));

247

248

249

250

err = mrses_spe_real_run(

251

mrses, cur_result,

252

walign, walign2, aA, aB, //width, width2, nA, nB, alloc,

253

A, B, mean_copy, C, Ca, Cb

254

);

255

256

257

258

err = mrses_spe_real_run(

259

mrses, result,

260

walign, walign2, aA, aB, //width, width2, nA, nB, alloc,

261

A, B, mean_copy, C, Ca, Cb

262

);

263

264

265

266

7. Vectorization tests with various invalid operations

267

268

vector float one = {1, -1, 1, 1};

269

vector float some_zero = {0, 1, 1, 0};

270

vector float zero = {0, 0, 0, 0};

271

272

vector float result;

273

result = divf4(one, some_zero);

274

275

printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));

276

printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

277

278

result = sqrtf4(result);

279

280

printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));

281

printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

282

283

result = divf4_fast(one, some_zero);

284

285

printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));

286

printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

287

288

result = sqrtf4_fast(fmaxf4(result, zero));

289

290

printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));

291

printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

292

293

return 0;

294

295

296

297

8. Debugging vectorized code

298

299

int err = atlas_spotrf2(real_width, Ca, width);

300

if (err) printf("Problem with spotrf\n");

301

else {

302

if ((((unsigned int)D)%16)==0) {

303

printf("Cholesky Ca\n");

304

PRINT_MATRIX("% 6.4f ", Ca, 16, 5, 5)

305

}

306

}

307

308

309

float C[16*16];

310

memset(C, 0, 16*16 * sizeof(MRSESDataType));

311

memcpy(C, Ca, rww * sizeof(MRSESDataType));

312

spe_axpy(16*16, 1, Cb, 1, C, 1);

313

spe_scal(16*16, 0.5, C, 1);

314

315

316

puts("=====================================");

317

int c_step = real_width * (width + 1);

318

atlas_spotrf2(real_width, Ca, width);

319

atlas_spotrf2(real_width, Cb, width);

320

atlas_spotrf2(real_width, C, width);

321

float detC = C[0]; float detAB = Ca[0] * Cb[0];

322

for (i = width + 1; i < c_step; i+= (width+1)) {

323

detAB *= (Ca[i] * Cb[i]);

324

detC *= C[i];

325

}

326

327

float rcorr = 2 * logf(detC * detC / detAB);

328

printf("Real det: %e = %e %e\n", rcorr, detC, detAB);

329

330

331

9. Non-vectorized D-recovery

332

static inline int mrses_spe_real_multiply_recover_old(

333

MRSESDataType *D,

334

short int pack, short int at_once,

335

short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,

336

MRSESDataType *A, MRSESDataType *B,

337

MRSESDataType *Ca, MRSESDataType *Cb,

338

MRSESIntType *drp_gen, MRSESIntType *rst_gen,

339

MRSESDataType *Ea, MRSESDataType *Eb

340

341

) {

342

short int i, j, k, l;

343

344

345

if ((((int)D)%16)==0) {

346

if (*rst_gen > width) {

347

printf("Orig\n");

348

PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);

349

}

350

}

351

if (*rst_gen > width) {

352

vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);

353

}

354

if ((((int)D)%16)==0) {

355

if (*rst_gen > width) {

356

printf("Update: %i\n", *drp_gen);

357

PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);

358

}}

359

360

if (*rst_gen > width) {

361

vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);

362

}

363

364

/* if (*rst_gen > width) {

365

vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);

366

vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);

367

}*/

368

369

short int c_step = width * (walloc + 1);

370

for (i = 0; i < at_once; ++i) {

371

short int e_offset = i * walloc;

372

short int c_offset = i * c_step;

373

374

short int gen = drp_gen[i];

375

376

MRSESDataType *Ea_cur = Ea + e_offset;

377

MRSESDataType *Eb_cur = Eb + e_offset;

378

379

MRSESDataType *Ca_cur = Ca + c_offset;

380

MRSESDataType *Cb_cur = Cb + c_offset;

381

382

MRSESDataType *Da = D + (3 * i + 1) * d_step;

383

MRSESDataType *Db = Da + d_step;

384

385

// PRINT_MATRIX("% 6f ", Ea_cur, 0, 1, 5);

386

387

if (rst_gen[i] > width) {

388

for (j = 0; j < width; j++) {

389

if (j < gen) {

390

l = j; k = gen;

391

} else {

392

l = gen; k = j;

393

}

394

395

// if (fabs(Ca[k*walloc+l] - Ea_cur[j])>0.001) {

396

// printf("Restoring (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Ca_cur[k*walloc + l], Ea_cur[j]);

397

// }

398

Ca_cur[k * walloc + l] = Ea_cur[j];

399

Cb_cur[k * walloc + l] = Eb_cur[j];

400

}

401

402

}

403

404

// Recovering to current stage

405

for (k = 0; k < width; ++k) {

406

for (l = 0; l < width; ++l) {

407

Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];

408

Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];

409

}

410

}

411

}

412

413

414

float Ta[16*16];

415

vec_ssyrk_rln_11(ralloc, nA, A, nA, Ta, walloc);

416

int prob = 0;

417

for (i = 0; i < 5; i++) {

418

for (j = 0; j < 5; j++) {

419

if (fabs(Ta[i*walloc+j] - Ca[i*walloc+j])>0.00001) {

420

prob = 1;

421

break;

422

}

423

}

424

}

425

if (prob) {

426

printf("Restoring: %i, gen: %i\n", *rst_gen, *drp_gen);

427

printf("Computed\n");

428

PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);

429

printf("Should be:\n");

430

PRINT_MATRIX("% 6.4f", Ta, walloc, 5, 5);

431

exit(1);

432

} else {

433

// printf("pass\n");

434

}

435

436

// vec_ssyrk_rln_11(ralloc, nB, B, nB, Tb, walloc);

437

438

439

return 0;

440

}

441