001: /*

002:  * Copyright (C) 2002 Jeffrey R. Hoy <jrh26@cornell.edu>

003:  * 

004:  * This program is free software; you can redistribute it and/or

005:  * modify it under the terms of the GNU General Public License

006:  * as published by the Free Software Foundation; either version 2

007:  * of the License, or (at your option) any later version.

008:  * 

009:  * This program is distributed in the hope that it will be useful,

010:  * but WITHOUT ANY WARRANTY; without even the implied warranty of

011:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

012:  * GNU General Public License for more details.

013:  * 

014:  * You should have received a copy of the GNU General Public License

015:  * along with this program; if not, write to the Free Software

016:  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

017:  */

018: 

019: 

020: 

021: #include <stdio.h>

022: #include <fstream.h>

023: #include <stdlib.h>

024: #include <string.h>

025: 

026: #include <math.h>

027: #include <time.h>

028: 

029: // Parameters

030: typedef /*unsigned __int64*/ int DWORD64; // change this to use 64-bit values

031: const double percentReads = 0.95; // amount of data to cover

032: int cutoff = 50; // max sequence length

033: const int numFiles = 500; // number of files to build at once

034: #define BUFF_SIZE 1500000 // DWORD64s

035: #define HASH_SIZE 1048576 // integers

036: #define LISTBLOCK_SIZE 1048576 // integers

037: const int writeBuffSize = 20000;

038: #define MAX_ASTERIK 0

039: #define ASTERIK 99999999 // special number to mark '*'

040: 

041: // Global variables

042: int minSupport;

043: int minSupportTotal;

044: int* globalBlock;  // final stat information

045: int blockSize = 0;

046: int blockAlloc = 0;

047: int blockMax = 10000; // initial num lines in global stat block

048: const int lineLength = cutoff + 2; // cutoff for sequence, one for count, one for length

049: int globalSequenceLength; // used for binary search

050: #define SYNTAX "Syntax:\n  mine.exe datafile outfile minSupportIndividual minSupportTotal "

051: 

052: int checkAndMerge(int* line1, int* line2, int len);

053: void addBlockData(int length, int count, int* label);

054: 

055: 

056: class Sequence {

057: 

058: private:

059: 

060:         int sequenceLength;

061:         int numSequences;

062:         DWORD64* data;

063:         char* fileName;

064: 

065: 

066: public:

067: 

068:         Sequence(char* fileNameIn); // constructor for length 1

069:         Sequence(Sequence* seqLenN, Sequence* seqLen1, char* fileNameIn); // constructor for length >1

070:         ~Sequence() {

071:                 if (numSequences > 1 && data != NULL)

072:                         delete [] data;

073:         }

074: 

075:         int getNumSequences() {

076:                 return numSequences;

077:         }

078: 

079:         

080:         int getSequenceLength() {

081:                 return sequenceLength;

082:         }

083: 

084:         const DWORD64* getData() {

085:                 return data;

086:         }

087: 

088:         void addToBlock();

089: 

090: };

091: 

092: 

093: 

094: 

095: // Comparison for size 1 sorting, smaller first

096: int compare1(const void* a, const void* b) {

097:         if (*(DWORD64*)a > *(DWORD64*)b)

098:                 return 1;

099:         else if (*(DWORD64*)a == *(DWORD64*)b)

100:                 return 0;

101:         else

102:                 return -1;

103: }

104: 

105: 

106: 

107: // Comparison for global block sorting, larger first

108: int compare2(const void* a, const void* b) {

109:         return ((int*)b)[cutoff] - ((int*)a)[cutoff]; // switch (ret positive) if b is larger

110: }

111: 

112: 

113: // Comparison for finding most significant loads, larger first

114: int compare3(const void* a, const void* b) {

115:         if (*(DWORD64*)a < *(DWORD64*)b)

116:                 return 1;

117:         else if (*(DWORD64*)a == *(DWORD64*)b)

118:                 return 0;

119:         else

120:                 return -1;

121: }

122: 

123: // For binary search on sorted sequences

124: int search1(const void* a, const void* b) {

125:         int i;

126:         for (i = 0; i < globalSequenceLength; i++) {

127:                 if ( ((DWORD64*)a)[i] < ((DWORD64*)b)[i] )

128:                         return -1;

129:                 else if ( ((DWORD64*)a)[i] > ((DWORD64*)b)[i] )

130:                         return 1;

131:         }

132: 

133:         return 0;

134: }

135: 

136: 

137: 

138: 

139: // Builds all sequences of length 1

140: Sequence :: Sequence(char* fileNameIn) {

141:         int index;

142:         int intsRead;

143:         int listIndex = 0;

144:         int numValues = 0;

145:         int check;

146: 

147:         sequenceLength = 1;

148:         fileName = fileNameIn;

149: 

150:         // probabilistic counting:

151:         //   hash everthing into large buckets,

152:         //   mark buckets that have more than minimum support

153:         //   hash again, make list of address that might have min support and counts

154:         //   sort and pack

155: 

156:         DWORD64* hashBlock = new DWORD64 [HASH_SIZE];

157:         if (hashBlock == NULL) { cout << "Insufficient Memory for hashBlock"; return; }

158:         memset(hashBlock, 0, HASH_SIZE*sizeof(DWORD64));

159: 

160:         ifstream* traceFile = new ifstream(fileName, ios::in | ios::binary);

161:         if (!traceFile) { cout << "Unable to open trace file" << endl; return; }

162: 

163:         DWORD64* readBuff = new DWORD64 [BUFF_SIZE];

164:         if (readBuff == NULL) { cout << "Insufficient Memory for readBuff" << endl; return; }

165: 

166:         while (!traceFile->eof()) {

167:                 traceFile->read((char*)readBuff, BUFF_SIZE*sizeof(DWORD64));

168: 

169:                 intsRead = traceFile->gcount() / sizeof(DWORD64);

170: 

171:                 for (index = 0; index < intsRead; index++) {

172:                         if (readBuff[index] % HASH_SIZE > HASH_SIZE || readBuff[index] % HASH_SIZE < 0) {

173:                                 cout << "Problem with mod and hashing the data:" << endl;

174:                                 cout << *(int*)&readBuff[index] << " " << *(((int*)&readBuff[index])+1)<< endl;

175:                                 cout << index << endl;

176:                         } else

177:                         hashBlock[readBuff[index] % HASH_SIZE]++;

178:                 }

179:         }

180:         traceFile->close();

181: 

182:         traceFile->open(fileName, ios::in | ios::binary);

183:         if (!traceFile) { cout << "Unable to open trace file" << endl; return; }

184: 

185:         DWORD64* listBlock = new DWORD64 [LISTBLOCK_SIZE*2];

186:         if (listBlock == NULL) { cout << "Insufficient Memory for listBlock, change LISTBLOCK_SIZE" << endl; return; }

187:         memset(listBlock, 0, LISTBLOCK_SIZE*2*sizeof(DWORD64));

188: 

189:         while (!traceFile->eof()) {

190:                 traceFile->read((char*)readBuff, BUFF_SIZE*sizeof(DWORD64));

191: 

192:                 intsRead = traceFile->gcount() / sizeof(DWORD64);

193: 

194:                 for (index = 0; index < intsRead; index++) {

195:                         if (hashBlock[readBuff[index] % HASH_SIZE] >= minSupport) {

196:                                 int found = 0;

197:                                 for (check = 0; check < listIndex; check++) {

198:                                         if (listBlock[check*2] == readBuff[index]) {

199:                                                 found = 1;

200:                                                 listBlock[check*2+1]++;

201:                                         }

202:                                 }

203:                                 if (!found) {

204:                                         listBlock[listIndex*2] = readBuff[index];

205:                                         listBlock[listIndex*2+1]++;

206:                                         listIndex++;

207:                                         if (listIndex == 3000) {

208:                                                 cout << "3000 sequences length 1 thus far, this might get slow" << endl;

209:                                         }

210:                                         if (listIndex >= LISTBLOCK_SIZE) {

211:                                                 cout << "Need more memory for listBlock... Program exit" << endl;

212:                                         }

213:                                 }

214: 

215:                         }

216:                 }

217:         }

218:         traceFile->close();

219:         delete [] readBuff;

220: 

221: 

222:         // pack it up

223:         numSequences = 0;

224:         for (index = 0; index < listIndex; index++) 

225:                 if (listBlock[index*2+1] >= minSupport) 

226:                         numSequences++;

227:         data = new DWORD64[numSequences*2*sizeof(DWORD64)];

228:         if (data == NULL) { cout << "Insufficient Memory for data" << endl; return; }

229:         memset(data, 0, numSequences*2*sizeof(DWORD64));

230: 

231:         // pack it up

232:         numSequences = 0;

233:         for (index = 0; index < listIndex; index++) {

234:                 if (listBlock[index*2+1] >= minSupport) {

235:                         data[numSequences*2] = listBlock[index*2];

236:                         data[numSequences*2+1] = listBlock[index*2+1];

237:                         numSequences++;

238:                 }

239:         }

240: 

241: #if MAX_ASTERIK > 0

242:         // Added code for wildcard

243:         data[numSequences*2] = ASTERIK;

244:         numSequences++;

245: #endif

246: 

247:         // sort in increasing order

248:         qsort(data, numSequences, sizeof(DWORD64)*2, compare1);

249: 

250: cout << "Num potential sequences len1 is " << numSequences << endl;

251: 

252:         delete [] hashBlock;

253:         delete [] listBlock;

254:         delete traceFile;

255: }

256: 

257: 

258: 

259: 

260: 

261: 

262: 

263: // Builds all sequences of length n+1

264: Sequence :: Sequence(Sequence* seqLenN, Sequence* seqLen1, char* fileNameIn) {

265: 

266:         int i,j,k,h;

267: 

268:         int lenSeqN = seqLenN->getSequenceLength();

269:         int numSeqN = seqLenN->getNumSequences();

270:         int numSeq = 0;

271:         const DWORD64* seqLenNData = seqLenN->getData();

272: 

273:         sequenceLength = lenSeqN+1;

274:         fileName = fileNameIn;

275: 

276:         // build larger sequences

277:         // just get count this time

278:         numSeq = 0;

279:         for (i = 0; i < numSeqN; i++) {

280:                 for (j = 0; j < numSeqN; j++) {

281:                         if (memcmp(&seqLenNData[i*(lenSeqN+1)+1], &seqLenNData[j*(lenSeqN+1)], (lenSeqN-1)*sizeof(DWORD64)) == 0) { // inner match found

282:                                 numSeq++;

283:                         }

284:                 }

285:         }

286: 

287:         DWORD64* tempData = new DWORD64[numSeq * (sequenceLength+1)];

288:         if (tempData == NULL) { cout << "Insufficient Memory2" << endl; exit(1); }

289:         memset(tempData, 0, sizeof(DWORD64)*numSeq*(sequenceLength+1));

290: 

291:         // now actually build

292:         numSeq = 0;

293:         for (i = 0; i < numSeqN; i++) {

294:                 for (j = 0; j < numSeqN; j++) {

295:                         if (memcmp(&seqLenNData[i*(lenSeqN+1)+1], &seqLenNData[j*(lenSeqN+1)], (lenSeqN-1)*sizeof(DWORD64)) == 0) { // inner match found

296: 

297:                                 memcpy(&tempData[numSeq*(sequenceLength+1)], &seqLenNData[i*(lenSeqN+1)], lenSeqN*sizeof(DWORD64));

298:                                 tempData[numSeq*(sequenceLength+1) + (sequenceLength-1)] = seqLenNData[j*(lenSeqN+1)+(lenSeqN-1)];

299: 

300:                                 int astcount = 0;

301:                                 for (k = 0; k < sequenceLength; k++) {

302:                                         if (tempData[numSeq*(sequenceLength+1)+k] == ASTERIK)

303:                                                 astcount++;

304:                                 }

305: 

306:                                 if (astcount <= MAX_ASTERIK) // accept new sequence

307:                                         numSeq++;

308:                         }

309:                 }

310:         }

311: 

312:         cout << "Num potential sequences length "<<sequenceLength<<" is " << numSeq << endl;

313: 

314:         ifstream* traceFile = new ifstream(fileName, ios::in | ios::binary);

315:         if (traceFile->fail()) { cout << "Unable to open trace file, program exit" << endl;        return;        }

316: 

317:         int intsRead = 0;

318: 

319:         DWORD64* buff = new DWORD64 [BUFF_SIZE];

320:         if (buff == NULL) { cout << "Insufficient Memory for readBuff" << endl;        return;        }

321: 

322:         while (!traceFile->eof()) {

323:                 traceFile->read((char*)(&buff[intsRead]), (BUFF_SIZE-intsRead)*sizeof(DWORD64));

324:                 intsRead += traceFile->gcount() / sizeof(DWORD64);

325: 

326:                 // new optimized code to handle wildcard searches

327:                 globalSequenceLength = sequenceLength; // so search knows number of elements

328:                 DWORD64* place = NULL;

329:                 DWORD64 temp1 = 0, temp2 = 0;

330: 

331:                 for (h = 0; h < intsRead-sequenceLength+1; h++) {

332: 

333:                         // using #'s because this is the hotspot

334:                         // This could be rewitten so handle any number of asteriks, but it would be more complex code

335:                         // The search space is too large for more than 2 asteriks, so it shouldn't matter

336:                         #if MAX_ASTERIK >= 3

337:                                 cout << "Only handling 2 asteriks for now" << endl;

338:                         #endif

339: 

340:                         #if MAX_ASTERIK >= 2

341:                                 for (i = 0; i < sequenceLength-1; i++) {

342:                                         temp1 = buff[h + i];

343:                                         buff[h + i] = ASTERIK;

344:                                         for (j = i+1; j < sequenceLength; j++) {

345:                                                 temp2 = buff[h + j];

346:                                                 buff[h + j] = ASTERIK;

347: 

348:                                                 place = (DWORD64*)bsearch(&buff[h], tempData, numSeq, sizeof(DWORD64)*(sequenceLength+1), search1);

349:                                                 if (place != NULL) // found

350:                                                         place[sequenceLength]++;

351: 

352:                                                 buff[h + j] = temp2;

353: 

354:                                         }

355:                                         buff[h + i] = temp1;

356:                                 }

357:                         #endif

358: 

359:                         #if MAX_ASTERIK >= 1

360:                                 //change so it only looks up if it misses the others?

361:                                 for (i = 0; i < sequenceLength; i++) {

362:                                         temp1 = buff[h + i];

363:                                         buff[h + i] = ASTERIK;

364: 

365:                                         place = (DWORD64*)bsearch(&buff[h], tempData, numSeq, sizeof(DWORD64)*(sequenceLength+1), search1);

366:                                         if (place != NULL) // found

367:                                                 place[sequenceLength]++;

368: 

369:                                         buff[h + i] = temp1;

370:                                 }

371:                         #endif

372: 

373:                         // also handle original search

374:                         place = (DWORD64*)bsearch(&buff[h], tempData, numSeq, sizeof(DWORD64)*(sequenceLength+1), search1);

375:                         if (place != NULL) // found

376:                                 place[sequenceLength]++;

377:                 }

378: 

379:                 int intsToCopy = sequenceLength-1;  // need to overlap the buffer

380:                 memcpy(&buff[0], &buff[intsRead-intsToCopy], (intsToCopy)*sizeof(DWORD64));

381:                 intsRead = intsToCopy;

382: 

383:         }

384: 

385:         traceFile->close();

386: 

387: 

388: 

389:         // pack the data by removing unsupported sequences

390:         numSequences = 0;

391:         for (i = 0; i < numSeq; i++) {

392:                 if (tempData[i*(sequenceLength+1) + sequenceLength] >= minSupport)

393:                         numSequences++;

394:         }

395: 

396:         data = new DWORD64 [numSequences * (sequenceLength+1)];

397:         if (data == NULL) { cout << "Insufficient Memory3" << endl;        return;        }

398: 

399:         numSequences = 0;

400:         for (i = 0; i < numSeq; i++) {

401:                 if (tempData[i*(sequenceLength+1) + sequenceLength] >= minSupport) {

402:                         memcpy(&data[numSequences * (sequenceLength+1)], &tempData[i * (sequenceLength+1)], (sequenceLength+1)*sizeof(DWORD64));

403:                         numSequences++;

404:                 }

405:         }

406: 

407:         delete [] buff; 

408:         delete [] tempData;

409:         delete traceFile;

410: 

411: }

412: 

413: 

414: 

415: 

416: void Sequence :: addToBlock() {

417:         int i,j,k;

418:         int found;

419:         int label[200] = {0};

420:         int labelCount = 0;

421: 

422:         for (i = 0; i < numSequences; i++) {

423:                 

424:                 // get labels for block line

425:                 labelCount = 0;

426:                 for (j = 0; j < sequenceLength; j++) {

427:                         found = 0;

428:                         for (k = 0; k < j; k++) {

429:                                 if (data[i*(sequenceLength+1) + j] == data[i*(sequenceLength+1) + k]) { // repeated element

430:                                         found = 1;

431:                                         label[j] = label[k];

432:                                         break;

433:                                 }

434:                         }

435:                         if (!found) {

436:                                 if (data[i*(sequenceLength+1) + j] == ASTERIK) {

437:                                         label[j] = ASTERIK;

438:                                 } else {

439:                                         labelCount++;

440:                                         label[j] = labelCount;

441:                                 }

442:                         }

443:                 }

444:                 addBlockData(sequenceLength, (int)data[i*(sequenceLength+1) + sequenceLength], label);

445:         }

446: 

447: }

448: 

449: 

450: 

451: 

452: 

453: 

454: 

455: 

456: 

457: 

458: // global block update

459: // Global block structure:

460: // blockMax lines each of length linelength

461: //  each line has 0 to cutoff labels, count, and length

462: //

463: 

464: void addBlockData(int length, int count, int* label) {

465: 

466:         int i;

467: 

468:         // search for line

469:         // could actually hash this if it is a bottleneck

470:         for (i = 0; i < blockSize; i++) {

471:                 if (memcmp(&globalBlock[i*lineLength], label, cutoff*sizeof(int)) == 0) {

472:                         globalBlock[i*lineLength + cutoff] += count;

473:                         return;

474:                 }

475:         }

476: 

477:         // not found

478:         memcpy(&globalBlock[blockSize*lineLength], label, cutoff*sizeof(int));

479:         globalBlock[blockSize*lineLength + cutoff] += count;

480:         globalBlock[blockSize*lineLength + cutoff + 1] = length;

481: 

482:         blockSize++;

483: 

484:         // check if will need more memory for the block

485:         if (blockSize*lineLength >= blockAlloc) {

486:                 cout << "Global info block full, reallocating..." << endl;

487: 

488:                 blockAlloc = blockAlloc * 4;

489:                 int* globalBlockNew = new int [blockAlloc];

490:                 if (globalBlockNew == NULL) { cout << "Insufficient memory for global information block, program exit" << endl; exit(1); }

491:                 cout << "Done reallocating1..." << endl;

492:                 memset(globalBlockNew, 0, blockAlloc*sizeof(int));

493:                 cout << "Done reallocating2..." << endl;

494: 

495:                 memcpy(globalBlockNew, globalBlock, blockSize*lineLength*sizeof(int));

496:                 cout << "Done reallocating3..." << endl;

497:                 delete [] globalBlock ;

498:                 globalBlock = globalBlockNew;

499: 

500:                 cout << "Done reallocating..." << endl;

501: 

502:         }

503: }

504: 

505: 

506: 

507: 

508: // Normalize pattern by starting on 'A'

509: void flatten(int* a, int len) {

510: 

511:         int i;

512:         int newcount = 1;

513:         int* change = new int[cutoff];

514: 

515:         for (i = 0; i < len; i++)

516:                 change[i] = -1;

517: 

518:         for (i = 0; i < len; i++) {

519:                 if (a[i] != ASTERIK) if (change[a[i]] == -1) {

520:                         change[a[i]] = newcount;

521:                         newcount++;

522:                 }

523:         }

524: 

525:         for (i = 0; i < len; i++) if (a[i] != ASTERIK) {

526:                 a[i] = change[a[i]];

527:         }

528: 

529:         delete [] change;

530: 

531: }

532: 

533: 

534: 

535: // See if pattern is same thing but shifted, if so merge

536: int checkAndMerge(int* line1, int* line2, int len) {

537: 

538:         int i,j;

539:         int found = 0;

540: 

541:         int* a = new int[len];

542:         int* b = new int[len];

543: 

544: 

545:         for (j = 0; j < len; j++)

546:                 b[j] = line2[j];

547: 

548:         for (i = 0; i < len; i++) {

549:                 for (j = 0; j < len; j++) {

550:                         a[j] = line1[(i+j) % len];

551:                 }

552:                 flatten(a, len);

553: 

554:                 found = 1;

555:                 for (j = 0; j < len; j++)

556:                         if (a[j] != b[j])

557:                                 found = 0;

558: 

559:                 if (found) { // take max times appearing

560:                         if (line1[cutoff] > line2[cutoff])

561:                                 line2[cutoff] = 0;

562:                         else

563:                                 line1[cutoff] = 0;

564:                         break;

565:                 }

566:         }

567: 

568:         delete [] a;

569:         delete [] b;

570: 

571:         return found;

572: 

573: }

574: 

575: 

576: 

577: // Formats results to outfile

578: void printBlockData(char* outFileName, char* dataFileName) {

579:         

580:         cout << "Global block size is " << blockSize << endl;

581: 

582:         int i,j,k;

583:         int templine[1000] = {0};

584: 

585:         // combine patterns that are really similiar, like AAAB and AABA and ABAA and ABBB

586:         for (i = 1; i <= cutoff; i++) {

587:                 for (j = 0; j < blockSize-1; j++) {

588:                         if (globalBlock[j*lineLength + cutoff + 1] == i && globalBlock[j*lineLength + cutoff] > 0)

589:                                 for (k = j+1; k < blockSize; k++) {

590:                                         // check that length is good

591:                                         if (globalBlock[k*lineLength + cutoff + 1] == i && globalBlock[k*lineLength + cutoff] > 0)

592:                                                 checkAndMerge(&globalBlock[j*lineLength], &globalBlock[k*lineLength], i);

593:                         }

594:                 }

595:         }

596: 

597:         // sorts the whole thing, ignoring length at this point

598:         qsort(globalBlock, blockSize, lineLength*sizeof(int), compare2);

599: 

600: 

601:         ofstream outFile;

602:         outFile.open(outFileName, ios::out | ios::trunc);

603:         if (outFile.fail()) {

604:                 cout << "Can't open output file: " << outFileName << endl;

605:                 return;

606:         }

607: 

608:         outFile << "Data File Name: " << dataFileName << endl;

609:         outFile << "Minimum support for each load: " << minSupport << endl;

610:         outFile << "Minimum support over all loads: " << minSupportTotal << endl;

611: 

612:         for (i = 1; i <= cutoff; i++) {

613:                 outFile << "Sequences Length " << i << endl;

614:                 for (j = 0; j < blockSize; j++) {

615:                         if (globalBlock[j*lineLength + cutoff + 1] == i && globalBlock[j*lineLength + cutoff] >= minSupportTotal) {

616:                                 outFile << "  Count: " << globalBlock[j*lineLength + cutoff] << " Sequence:";

617:                                 for (k = 0; k < i; k++) {

618:                                         if (globalBlock[j*lineLength + k] == ASTERIK)

619:                                                 outFile << " " << '*';

620:                                         else

621:                                                 outFile << " " << (char)((globalBlock[j*lineLength + k]-1) + 'A');

622:                                 }

623:                                 outFile << endl;

624:                         }

625:                 }

626:         }

627: 

628: 

629:         outFile.close();

630: 

631: }

632: 

633: 

634: 

635: 

636: // Analyzes a single load file, building sequences from 1 to cutoff

637: void Start(char* fileName) {

638: 

639:         Sequence* seqLenN = new Sequence(fileName);

640:         seqLenN->addToBlock();

641: 

642:         while (seqLenN->getNumSequences() > 0 && seqLenN->getSequenceLength() < cutoff) {

643:                 Sequence* seqLenNPlus1 = new Sequence(seqLenN, seqLenN, fileName);

644: 

645:                 delete seqLenN;

646:                 seqLenN = seqLenNPlus1;

647:                 seqLenN->addToBlock();

648:         }

649:         delete seqLenN;

650: 

651: }

652: 

653: 

654: 

655: 

656: 

657: 

658: 

659: void separateLoads(char* dataFileName, char* outFileName) {

660: 

661:         cout << "Execution Beginning" << endl;

662:         cout << "Max Sequence Length: " << cutoff << endl;

663: 

664:         int totalLoads = 0;

665:         int temp = 0;

666:         DWORD64 totalReads = 0;

667:         DWORD64 countReads = 0;

668: 

669:         int readsUsed = -1;

670:         int i;

671: 

672:         ifstream* traceFile = new ifstream(dataFileName, ios::in | ios::binary);

673:         if (!traceFile) {

674:                 cout << "Unable to open trace file, program exit" << endl;

675:                 return;

676:         }

677: 

678:         //

679:         // Obtain numbers of most significant loads

680:         // Variable loadGroup has an int for each load.  It stores the new file number if the

681:         //  load is significant.  Otherwise it stores 0.

682:         //

683:         

684:         traceFile->read((char*)&totalLoads, sizeof(int));

685:         cout << (int)totalLoads << " Total Loads" << endl;

686: 

687:         DWORD64* loadNums = new DWORD64[(int)totalLoads*2];

688:         if (loadNums == NULL) cout << "Not enough memory for loadNums" << endl;

689:         memset(loadNums, 0, sizeof(DWORD64)*(int)totalLoads*2);

690: 

691:         int* readBuff = new int [BUFF_SIZE*2];

692:         if (readBuff == NULL) cout << "Not enough memory for readBuff" << endl;

693:         while (!traceFile->eof()) {

694:                 traceFile->read((char*)readBuff, BUFF_SIZE*2*sizeof(int));

695: 

696:                 int valsRead = traceFile->gcount() / sizeof(DWORD64);

697: 

698:                 for (i = 0; i < valsRead; i+=2)

699:                         loadNums[readBuff[i]*2]++;

700:         }

701:         traceFile->close();

702: 

703:         for (i = 0; i < totalLoads; i++) {

704:                 loadNums[i*2+1] = i;

705:         }

706: 

707:         // sort, larger in front

708:         qsort(loadNums, (int)totalLoads, sizeof(DWORD64)*2, compare3);

709: 

710:         for (i = 0; i < totalLoads; i++)

711:                 totalReads+=loadNums[i*2];

712: 

713:         for (i = 0; i < totalLoads; i++) {

714:                 countReads+=loadNums[i*2];

715:                 if (((double)(int)countReads) / ((double)(int)totalReads) > percentReads) {

716:                         readsUsed = i+1;

717:                         break;

718:                 }

719:         }

720: 

721:         cout << readsUsed << " Loads Used" << endl;

722:         if (readsUsed == -1) { cout << "Bad number of loads.  Perhaps file does not exist?  Program exit." << endl;        exit(1); }

723:         DWORD64* loadGroup = new DWORD64 [(int)totalLoads];

724:         if (loadGroup == NULL) cout << "Not enough memory for loadGroup" << endl;

725:         memset(loadGroup, 0, sizeof(DWORD64)*(int)totalLoads);

726: 

727:         for (i = 0; i < readsUsed; i++)

728:                 loadGroup[loadNums[i*2+1]] = i+1;  // assign group numbers to loads that will be used

729:         delete [] loadNums;

730: 

731: 

732: 

733:         //

734:         // Build a file for each of the significant loads

735:         // and analyze the values from that load

736:         //

737: 

738:         int* count = new int[numFiles];

739:         if (count == NULL) cout << "Not enough memory for count" << endl;

740:         memset(count, 0, sizeof(int)*numFiles);

741: 

742:         DWORD64* buff = new DWORD64[numFiles*writeBuffSize];

743:         if (buff == NULL) { cout << "Not enough memory for buff" << endl; return; }

744:         memset(buff, 0, sizeof(DWORD64)*numFiles*writeBuffSize);

745:         

746:         int base = 0;

747:         while (base < readsUsed) { // take it a group of files at a time

748: 

749:                 memset(buff, 0, sizeof(DWORD64)*numFiles*writeBuffSize);

750:                 memset(count, 0, sizeof(int)*numFiles);

751: 

752:                 int max = base + numFiles;

753:                 if (readsUsed < max)

754:                         max = readsUsed;

755: 

756:                 cout << "Building loads " << base << " to " << max-1 << endl;

757: 

758:                 ofstream* tempFile = new ofstream[numFiles];

759:                 char filename[100] = {0};

760: 

761:                 for (i = 0; i < max-base; i++) {

762:                         sprintf(filename, "load%04d.data", i);

763:                         tempFile[i].open(filename, ios::out | ios::binary);

764:                 }

765:                 DWORD64 group;

766:                 DWORD64 mgroup;

767:                 traceFile->open(dataFileName, ios::in | ios::binary);

768:                 traceFile->read((char*)&temp, sizeof(int));

769:                  while (!traceFile->eof()) {

770:                         traceFile->read((char*)readBuff, BUFF_SIZE*2*sizeof(int));

771:                         int valsRead = traceFile->gcount() / sizeof(int);

772: 

773:                         for (i = 0; i < valsRead; i+=2) {

774:                                 if (loadGroup[readBuff[i]] > 0) { // used in file

775:                                         group = loadGroup[readBuff[i]]-1; // get load number

776:                                         if (base <= group && group < max) { // if file is being built

777:                                                 mgroup = group-base;

778:                                                 buff[mgroup*writeBuffSize + count[mgroup]] = readBuff[i+1];

779:                                                 count[mgroup]++;

780:                                                 if (count[mgroup] == writeBuffSize) {

781:                                                         tempFile[mgroup].write((char*)&buff[mgroup*writeBuffSize], sizeof(DWORD64)*count[mgroup]);

782:                                                         count[mgroup]=0;

783:                                                 }

784:                                         }

785:                                 }

786:                         }

787:                 }

788:                 traceFile->close();

789: 

790:                 for (i = 0; i < max-base; i++) {

791:                         tempFile[i].write((char*)&buff[i*writeBuffSize], sizeof(DWORD64)*count[i]);

792:                         tempFile[i].close();

793:                 }

794: 

795:                  for (i = 0; i < max-base; i++) {

796:                         sprintf(filename, "load%04d.data", i);

797:                         Start(filename);

798:                         cout << "Analysis of Load " << i+base << " completed" << endl;

799:                 }

800: 

801:                 delete [] tempFile;

802:                 base += numFiles;

803:         }

804: 

805:         delete [] loadGroup;

806:         delete [] readBuff;

807:         delete [] buff;

808:         delete [] count;

809:         delete traceFile;

810: 

811:         cout << "Printing block data to " << outFileName << " ..." << endl;

812: 

813:         printBlockData(outFileName, dataFileName);

814: 

815:         cout << "Execution Complete" << endl;

816:         return;

817: 

818: }

819: 

820: 

821: 

822: 

823: 

824: 

825: 

826: 

827: int main(int argc, char* argv[]) {

828: 

829:         if (argc != 5) { cout << SYNTAX << endl; return 1; }

830: 

831:         char* dataFileName = argv[1];

832:         char* outFileName = argv[2];

833: 

834:         minSupport = atoi(argv[3]);

835:         minSupportTotal = atoi(argv[4]);

836: 

837:         if (minSupport > minSupportTotal) { cout << SYNTAX << endl; return 1; }

838: 

839:         blockAlloc = blockMax * lineLength;

840:         globalBlock = new int [blockAlloc];

841:         if (globalBlock == NULL) { cout << "Insufficient memory for global information block" << endl; return 1; }

842:         memset(globalBlock, 0, blockAlloc*sizeof(int));

843: 

844:         separateLoads(dataFileName, outFileName);

845: 

846:         delete [] globalBlock;

847: 

848:         return 0;

849: 

850: }

851: 

852: