Maintenance Commands
-prepare [release|report] Compare daily update to archive
-ignore Ignore contents of object in -prepare comparisons
-damaged Report UIDs containing damaged embedded HTML tags
-missing Print list of missing identifiers
-unique File of UIDs for skipping all but last version
Miscellaneous
-head Print before everything else
-tail Print after everything else
-hd Print before each record
-tl Print after each record
Update Candidate Report
cd "$MASTER/Pubmed"
gunzip -c *.xml.gz | xtract -strict -compress -format flush |
rchive -prepare report -ignore DateRevised -archive "$MASTER/Archive" \
-index MedlineCitation/PMID -pattern PubmedArticle
Unnecessary Update Removal
cd "$MASTER/Pubmed"
gunzip -c *.xml.gz | xtract -strict -compress -format flush |
rchive -prepare release -ignore DateRevised -archive "$MASTER/Archive" -index MedlineCitation/PMID \
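-head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" -pattern PubmedArticle |
xtract -format indent -xml '<?xml version="1.0" encoding="UTF-8"?>' \
-doctype '<!DOCTYPE PubmedArticleSet SYSTEM "http://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180101.dtd">' |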
gzip > newupdate.xml.gz
Get Archive UID List
pm-uids "$MASTER/Archive" > complete.uid
Reconstruct List of Versioned PMIDs
cd "$MASTER/Pubmed"
rm -f "$MASTER/Archive/versioned.uid"
gunzip -c *.xml.gz |
xtract -strict -pattern PubmedArticle -if MedlineCitation/PMID@Version -gt 1 \
-element MedlineCitation/PMID > "$MASTER/Archive/versioned.uid"
Reconstruct Release Files
split -a 3 -l 30000 release.uid uids-
n=1
for x in uids-???
do
xmlfile=$(printf "pubmed18n%04d.xml.gz" "$n")
n=$((n+1))
echo "$xmlfile"
cat "$x" |
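rchive -fetch "$MASTER/Archive" -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" |
xtract -strict -format indent -xml '<?xml version="1.0" encoding="UTF-8"?>' \
-doctype '<!DOCTYPE PubmedArticleSet SYSTEM "http://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180101.dtd">' |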
gzip > "$xmlfile"
done
rm -rf uids-???
Damaged Embedded HTML Tag Search
for fl in *.xml.gz
do
echo "$fl"
gunzip -c "$fl" | rchive -mixed -damaged -index MedlineCitation/PMID^Version -pattern PubmedArticle
done
grep -v pubmed18n | grep AMPER | cut -f 1,6
Reconstruct Term List Keys
rm -f "$MASTER/Postings/sections.txt"
find "$MASTER/Postings" -name "*.mst" |
sed -e 's,.*/\(.*\)\.mst,\1,' |
sort | uniq > "$MASTER/Postings/sections.txt"
Generate Term List Paths
find "$MASTER/Postings" -name "*.trm" |
sed -e 's,\(.*/\)\(.*\.trm\),\1 \2,' |
sort -k 2 | uniq | tr -d ' '