Finding missing images on nodes in given time range

We encourage users to post events happening in the community to the community events group on https://www.drupal.org.
mikeytown2's picture

We had an issue where our NFS drive ran out of space and nodes where being created with bad file data. This occurred on 2012/5/21. We needed a way to find all nodes that have bad file data. Below is that solution.

Note that this script will add indexes for the fid to CCK tables. If not doing this, the big query never finished.

Hope someone finds this useful :)

<?php
  $joins
= array();
 
$selects = array();
 
$indexes = array();
  foreach (
content_types() as $node_type => $info) {
    foreach (
$info['fields'] as $field_name => $field) {
     
// Skip if not a filefield.
     
if ($field['type'] != 'filefield') {
        continue;
      }

     
// Get db info from CCK.
     
$db_info = content_database_info($field);
     
$alias = $db_info['table'];
     
$column = $db_info['columns']['fid']['column'];

     
// Add a,b,c to the alias name if table already used.
     
$character = 97; // 97 = a.
     
while (isset($joins[$alias])) {
       
$alias .= chr($character);
       
$character++;
      }

     
// Generate the joins and the selects.
     
$joins[$alias] = 'LEFT JOIN {' . $db_info['table'] . '} AS ' . $alias . ' ON ' . $alias . '.' . $column . ' = files.fid';
     
$selects[$alias] = $alias . '.nid';

     
// Generate the database index queries.
     
if (strpos($db_info['table'], 'content_field') === 0) {
       
$index = 'fid';
      }
      else {
       
$index = substr($column, 6);
      }
     
$indexes[$alias] = 'ALTER TABLE {' . $db_info['table'] . '} ADD INDEX ' . $index . ' ( '  . $column . ' ) ';
    }
  }

 
// Add indexes to database so big query will run.
 
foreach ($indexes as $sql) {
   
db_query($sql);
  }

 
// 2012/5/20 - 2012/5/22
 
$start = 1337472000;
 
$end = 1337644800;

 
// Generate large query.
 
$query = "
  SELECT
  COALESCE ("
. implode(", ", $selects) . ") as nid,
    files.filepath,
    files.filemime
  FROM files
  "
. implode("\n", $joins) . "
  WHERE COALESCE ("
. implode(", ", $selects) . ") IS NOT NULL
  AND files.timestamp BETWEEN "
. $start . " AND " . $end;

 
// Run query
 
$query = db_query($query);
 
$bad_nodes = array();
 
$count = 0;
  while (
$row = db_fetch_array($query)) {
    if (!isset(
$bad_nodes[$row['nid']])) {
     
$count++;
    }
   
// Make sure file exists and filesize if greater than 0.
   
if (file_exists($row['filepath']) && filesize($row['filepath']) > 0) {
     
// If file is an image, make sure it is not corrupted
     
if (strpos($row['filemime'], 'image/') !== FALSE) {
        if (
getimagesize($row['filepath'])) {
          continue;
        }
      }
     
// If file is not an image assume it is good.
     
else {
        continue;
      }
    }

   
// Add image to the bad nodes list.
   
$bad_nodes[$row['nid']][] = $row['filepath'];
  }

 
// Output results.
 
echo count($bad_nodes) . ' out of ' . $count . ' nodes have bad image data ' . "<br />\n";
  echo
my_print($bad_nodes);

// Function taken from drupal.org/project/httprl
// httprl_pr()
/**
* Pretty print data.
*
* @param $data
*   Data In.
* @return
*   Human readable HTML version of the data.
*/
function my_print($data) {
 
// Get extra arguments passed in.
 
$data = func_get_args();
  if (
count($data) == 1) {
   
$data = array_pop($data);
  }

 
// Remove non UTF-8 Characters, escape HTML markup, remove extra new lines.
 
$output = array_filter(explode("\n", htmlentities(iconv('utf-8', 'utf-8//IGNORE', print_r($data, TRUE)), ENT_QUOTES, 'UTF-8')));

 
// Whitespace compression.
 
foreach ($output as $key => $value) {
    if (
str_replace('    ', '', $value) == "(") {
     
$output[$key-1] .= ' (';
      unset(
$output[$key]);
    }
  }

 
// Replace whitespace with html markup.
 
$output = str_replace('    ', '&nbsp;&nbsp;&nbsp;&nbsp;', nl2br(implode("\n", $output))) . '<br />';
  return
$output;
}
?>