MediaWiki | PHP Cross Reference | Collaborative Wikis |
Description: Adds blobs from a given external storage cluster to the blob_tracking table.
<?php
/**
 * Adds blobs from a given external storage cluster to the blob_tracking table.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 * @see wfWaitForSlaves()
 */

require __DIR__ . '/../commandLine.inc';

// At least one cluster name is required on the command line.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";

/**
 * Rebuilds the blob_tracking table (and, when the gmp extension is loaded,
 * the blob_orphans table) for a list of external storage clusters.
 *
 * The work is done in passes, see run(): an integrity check, a reset of the
 * tracking tables, a scan of revision/text, a scan of text rows that no
 * tracked revision points at, and finally a scan of the clusters' own blob
 * tables for rows that nothing references.
 */
class TrackBlobs {
	/** @var string[] Names of the clusters to scan, as given to the constructor */
	public $clusters;

	/** @var string|null Cached SQL condition built by getTextClause() */
	public $textClause;

	/** @var bool|null True when gmp is loaded and the orphan blob pass will run */
	public $doBlobOrphans;

	/** @var array Map of cluster name => GMP number with one bit set per tracked blob ID */
	public $trackedBlobs = array();

	/** @var int Number of rows selected per query */
	public $batchSize = 1000;

	/** @var int Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param string[] $clusters Names of the external storage clusters to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One (initially empty) bitfield of tracked blob IDs per cluster
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all passes in order. The orphan blob pass is skipped when the gmp
	 * extension was not available at construction time.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Refuse to continue when the text or archive tables contain data this
	 * script cannot handle. Prints a diagnostic and terminates the process
	 * with exit status 1 on failure; returns normally otherwise.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// The query matches either condition; the returned flags tell us which one hit.
		if ( strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 *
	 * Each table is checked on its own, so a half-initialised schema (only one
	 * of the two tables present) no longer makes the DROP or the schema load fail.
	 * NOTE(review): presumably blob_tracking.sql creates both tables -- confirm.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition selecting text rows whose old_text starts with
	 * "DB://<cluster>/" for any of the configured clusters. The per-cluster
	 * LIKE conditions are joined with OR, and the result is cached in
	 * $this->textClause.
	 *
	 * @return string
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$this->textClause = '';
			foreach ( $this->clusters as $cluster ) {
				if ( $this->textClause != '' ) {
					$this->textClause .= ' OR ';
				}
				$this->textClause .= 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
		}
		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form DB://cluster/id or
	 * DB://cluster/id/hash.
	 *
	 * @param string $text Pointer as stored in old_text
	 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
	 *   'hash' (string, or null when the pointer has no hash part), or false
	 *   when $text does not match the expected format
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the next batch starts after them
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				// Both sides are strings, so compare strictly to avoid numeric-string coercion
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected; skip the write in that case
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// LEFT JOIN + "bt_text_id IS NULL" keeps only text rows with no blob_tracking row yet
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the next batch starts after them
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				// Both sides are strings, so compare strictly to avoid numeric-string coercion
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				// Orphan text has no revision, hence the zero page and revision IDs
				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected; skip the write in that case
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				// An unreachable or missing cluster is reported and skipped, not fatal
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the table name 'blobs' when the LB info does not name one
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
				}
				// $row still holds the last (highest) row of this batch
				$startId = $row->blob_id;

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 means there are none left
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush once a full batch has accumulated
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
title