docs/check_docs_meta_title.sh

   1 #!/bin/bash
   2
   3 # This script will search a website and gather up all of the <title></title> for each page
   4 # the results will land in out.csv
   5 # This is a nice aid to help us find pages that do not have the "right" headings
   6
   7 wget --spider -r -l inf -w .25 -nc -nd $1 -R bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,PNG,JPG,swf,txt,xml,xls,zip 2>&1 | tee wglog
   8
   9 rm out.csv
  10 cat wglog | grep '^--' | awk '{print $3}' | sort | uniq | while read url; do {
  11
  12 printf "%s* Retreiving title for: %s$url%s " "$bldgreen" "$txtrst$txtbld" "$txtrst"
  13 printf ""${url}","`curl -# ${url} | sed -n -E 's!.*<title>(.*)</title>.*!\1!p'`" , " >> out.csv
  14 printf " "
  15 }; done
  16