MediaWiki PHP Cross Reference Collaborative Wikis

Source: /maintenance/storage/trackBlobs.php - 396 lines - 11375 bytes - Summary - Text - Print

Description: Adds blobs from a given external storage cluster to the blob_tracking table.

   1  <?php
   2  /**
   3   * Adds blobs from a given external storage cluster to the blob_tracking table.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   * @ingroup Maintenance
  22   * @see wfWaitForSlaves()
  23   */
  24  
  25  require  __DIR__ . '/../commandLine.inc';
  26  
  27  
  28  if ( count( $args ) < 1 ) {
  29      echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
  30      echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
  31      echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";
  32  
  33      exit( 1 );
  34  }
  35  $tracker = new TrackBlobs( $args );
  36  $tracker->run();
  37  echo "All done.\n";
  38  
  39  class TrackBlobs {
  40      public $clusters, $textClause;
  41      public $doBlobOrphans;
  42      public $trackedBlobs = array();
  43  
  44      public $batchSize = 1000;
  45      public $reportingInterval = 10;
  46  
  47  	function __construct( $clusters ) {
  48          $this->clusters = $clusters;
  49          if ( extension_loaded( 'gmp' ) ) {
  50              $this->doBlobOrphans = true;
  51              foreach ( $clusters as $cluster ) {
  52                  $this->trackedBlobs[$cluster] = gmp_init( 0 );
  53              }
  54          } else {
  55              echo "Warning: the gmp extension is needed to find orphan blobs\n";
  56          }
  57      }
  58  
  59  	function run() {
  60          $this->checkIntegrity();
  61          $this->initTrackingTable();
  62          $this->trackRevisions();
  63          $this->trackOrphanText();
  64          if ( $this->doBlobOrphans ) {
  65              $this->findOrphanBlobs();
  66          }
  67      }
  68  
  69  	function checkIntegrity() {
  70          echo "Doing integrity check...\n";
  71          $dbr = wfGetDB( DB_SLAVE );
  72  
  73          // Scan for HistoryBlobStub objects in the text table (bug 20757)
  74  
  75          $exists = $dbr->selectField( 'text', 1,
  76              'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
  77              'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
  78              __METHOD__
  79          );
  80  
  81          if ( $exists ) {
  82              echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
  83                  "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
  84                  "to fix this.\n";
  85              exit( 1 );
  86          }
  87  
  88          // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
  89          $flags = $dbr->selectField( 'archive', 'ar_flags',
  90              'ar_flags LIKE \'%external%\' OR (' .
  91              'ar_flags LIKE \'%object%\' ' .
  92              'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
  93              __METHOD__
  94          );
  95  
  96          if ( strpos( $flags, 'external' ) !== false ) {
  97              echo "Integrity check failed: found external storage pointers in your archive table.\n" .
  98                  "Run normaliseArchiveTable.php to fix this.\n";
  99              exit( 1 );
 100          } elseif ( $flags ) {
 101              echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
 102                  "These objects are probably already broken, continuing would make them\n" .
 103                  "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
 104              exit( 1 );
 105          }
 106  
 107          echo "Integrity check OK\n";
 108      }
 109  
 110  	function initTrackingTable() {
 111          $dbw = wfGetDB( DB_MASTER );
 112          if ( $dbw->tableExists( 'blob_tracking' ) ) {
 113              $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_tracking' ) );
 114              $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_orphans' ) );
 115          }
 116          $dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
 117      }
 118  
 119  	function getTextClause() {
 120          if ( !$this->textClause ) {
 121              $dbr = wfGetDB( DB_SLAVE );
 122              $this->textClause = '';
 123              foreach ( $this->clusters as $cluster ) {
 124                  if ( $this->textClause != '' ) {
 125                      $this->textClause .= ' OR ';
 126                  }
 127                  $this->textClause .= 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
 128              }
 129          }
 130          return $this->textClause;
 131      }
 132  
 133  	function interpretPointer( $text ) {
 134          if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
 135              return false;
 136          }
 137          return array(
 138              'cluster' => $m[1],
 139              'id' => intval( $m[2] ),
 140              'hash' => isset( $m[3] ) ? $m[3] : null
 141          );
 142      }
 143  
 144      /**
 145       *  Scan the revision table for rows stored in the specified clusters
 146       */
 147  	function trackRevisions() {
 148          $dbw = wfGetDB( DB_MASTER );
 149          $dbr = wfGetDB( DB_SLAVE );
 150  
 151          $textClause = $this->getTextClause();
 152          $startId = 0;
 153          $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
 154          $batchesDone = 0;
 155          $rowsInserted = 0;
 156  
 157          echo "Finding revisions...\n";
 158  
 159          while ( true ) {
 160              $res = $dbr->select( array( 'revision', 'text' ),
 161                  array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
 162                  array(
 163                      'rev_id > ' . $dbr->addQuotes( $startId ),
 164                      'rev_text_id=old_id',
 165                      $textClause,
 166                      'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
 167                  ),
 168                  __METHOD__,
 169                  array(
 170                      'ORDER BY' => 'rev_id',
 171                      'LIMIT' => $this->batchSize
 172                  )
 173              );
 174              if ( !$res->numRows() ) {
 175                  break;
 176              }
 177  
 178              $insertBatch = array();
 179              foreach ( $res as $row ) {
 180                  $startId = $row->rev_id;
 181                  $info = $this->interpretPointer( $row->old_text );
 182                  if ( !$info ) {
 183                      echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
 184                      continue;
 185                  }
 186                  if ( !in_array( $info['cluster'], $this->clusters ) ) {
 187                      echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
 188                      continue;
 189                  }
 190                  $insertBatch[] = array(
 191                      'bt_page' => $row->rev_page,
 192                      'bt_rev_id' => $row->rev_id,
 193                      'bt_text_id' => $row->old_id,
 194                      'bt_cluster' => $info['cluster'],
 195                      'bt_blob_id' => $info['id'],
 196                      'bt_cgz_hash' => $info['hash']
 197                  );
 198                  if ( $this->doBlobOrphans ) {
 199                      gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
 200                  }
 201              }
 202              $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
 203              $rowsInserted += count( $insertBatch );
 204  
 205              ++$batchesDone;
 206              if ( $batchesDone >= $this->reportingInterval ) {
 207                  $batchesDone = 0;
 208                  echo "$startId / $endId\n";
 209                  wfWaitForSlaves();
 210              }
 211          }
 212          echo "Found $rowsInserted revisions\n";
 213      }
 214  
 215      /**
 216       * Scan the text table for orphan text
 217       * Orphan text here does not imply DB corruption -- deleted text tracked by the
 218       * archive table counts as orphan for our purposes.
 219       */
 220  	function trackOrphanText() {
 221          # Wait until the blob_tracking table is available in the slave
 222          $dbw = wfGetDB( DB_MASTER );
 223          $dbr = wfGetDB( DB_SLAVE );
 224          $pos = $dbw->getMasterPos();
 225          $dbr->masterPosWait( $pos, 100000 );
 226  
 227          $textClause = $this->getTextClause( $this->clusters );
 228          $startId = 0;
 229          $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
 230          $rowsInserted = 0;
 231          $batchesDone = 0;
 232  
 233          echo "Finding orphan text...\n";
 234  
 235          # Scan the text table for orphan text
 236          while ( true ) {
 237              $res = $dbr->select( array( 'text', 'blob_tracking' ),
 238                  array( 'old_id', 'old_flags', 'old_text' ),
 239                  array(
 240                      'old_id>' . $dbr->addQuotes( $startId ),
 241                      $textClause,
 242                      'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
 243                      'bt_text_id IS NULL'
 244                  ),
 245                  __METHOD__,
 246                  array(
 247                      'ORDER BY' => 'old_id',
 248                      'LIMIT' => $this->batchSize
 249                  ),
 250                  array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
 251              );
 252              $ids = array();
 253              foreach ( $res as $row ) {
 254                  $ids[] = $row->old_id;
 255              }
 256  
 257              if ( !$res->numRows() ) {
 258                  break;
 259              }
 260  
 261              $insertBatch = array();
 262              foreach ( $res as $row ) {
 263                  $startId = $row->old_id;
 264                  $info = $this->interpretPointer( $row->old_text );
 265                  if ( !$info ) {
 266                      echo "Invalid DB:// URL in old_id {$row->old_id}\n";
 267                      continue;
 268                  }
 269                  if ( !in_array( $info['cluster'], $this->clusters ) ) {
 270                      echo "Invalid cluster returned in SQL query\n";
 271                      continue;
 272                  }
 273  
 274                  $insertBatch[] = array(
 275                      'bt_page' => 0,
 276                      'bt_rev_id' => 0,
 277                      'bt_text_id' => $row->old_id,
 278                      'bt_cluster' => $info['cluster'],
 279                      'bt_blob_id' => $info['id'],
 280                      'bt_cgz_hash' => $info['hash']
 281                  );
 282                  if ( $this->doBlobOrphans ) {
 283                      gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
 284                  }
 285              }
 286              $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
 287  
 288              $rowsInserted += count( $insertBatch );
 289              ++$batchesDone;
 290              if ( $batchesDone >= $this->reportingInterval ) {
 291                  $batchesDone = 0;
 292                  echo "$startId / $endId\n";
 293                  wfWaitForSlaves();
 294              }
 295          }
 296          echo "Found $rowsInserted orphan text rows\n";
 297      }
 298  
 299      /**
 300       * Scan the blobs table for rows not registered in blob_tracking (and thus not
 301       * registered in the text table).
 302       *
 303       * Orphan blobs are indicative of DB corruption. They are inaccessible and
 304       * should probably be deleted.
 305       */
 306  	function findOrphanBlobs() {
 307          if ( !extension_loaded( 'gmp' ) ) {
 308              echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
 309              return;
 310          }
 311  
 312          $dbw = wfGetDB( DB_MASTER );
 313  
 314          foreach ( $this->clusters as $cluster ) {
 315              echo "Searching for orphan blobs in $cluster...\n";
 316              $lb = wfGetLBFactory()->getExternalLB( $cluster );
 317              try {
 318                  $extDB = $lb->getConnection( DB_SLAVE );
 319              } catch ( DBConnectionError $e ) {
 320                  if ( strpos( $e->error, 'Unknown database' ) !== false ) {
 321                      echo "No database on $cluster\n";
 322                  } else {
 323                      echo "Error on $cluster: " . $e->getMessage() . "\n";
 324                  }
 325                  continue;
 326              }
 327              $table = $extDB->getLBInfo( 'blobs table' );
 328              if ( is_null( $table ) ) {
 329                  $table = 'blobs';
 330              }
 331              if ( !$extDB->tableExists( $table ) ) {
 332                  echo "No blobs table on cluster $cluster\n";
 333                  continue;
 334              }
 335              $startId = 0;
 336              $batchesDone = 0;
 337              $actualBlobs = gmp_init( 0 );
 338              $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );
 339  
 340              // Build a bitmap of actual blob rows
 341              while ( true ) {
 342                  $res = $extDB->select( $table,
 343                      array( 'blob_id' ),
 344                      array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
 345                      __METHOD__,
 346                      array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
 347                  );
 348  
 349                  if ( !$res->numRows() ) {
 350                      break;
 351                  }
 352  
 353                  foreach ( $res as $row ) {
 354                      gmp_setbit( $actualBlobs, $row->blob_id );
 355                  }
 356                  $startId = $row->blob_id;
 357  
 358                  ++$batchesDone;
 359                  if ( $batchesDone >= $this->reportingInterval ) {
 360                      $batchesDone = 0;
 361                      echo "$startId / $endId\n";
 362                  }
 363              }
 364  
 365              // Find actual blobs that weren't tracked by the previous passes
 366              // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
 367              $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );
 368  
 369              // Traverse the orphan list
 370              $insertBatch = array();
 371              $id = 0;
 372              $numOrphans = 0;
 373              while ( true ) {
 374                  $id = gmp_scan1( $orphans, $id );
 375                  if ( $id == -1 ) {
 376                      break;
 377                  }
 378                  $insertBatch[] = array(
 379                      'bo_cluster' => $cluster,
 380                      'bo_blob_id' => $id
 381                  );
 382                  if ( count( $insertBatch ) > $this->batchSize ) {
 383                      $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
 384                      $insertBatch = array();
 385                  }
 386  
 387                  ++$id;
 388                  ++$numOrphans;
 389              }
 390              if ( $insertBatch ) {
 391                  $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
 392              }
 393              echo "Found $numOrphans orphan(s) in $cluster\n";
 394          }
 395      }
 396  }

title

Description

title

Description

title

Description

title

title

Body