README.md
最终得到以下文件(原文写的是“三个”,但下面一行列出了四个文件名,数量待确认):
queries.txt.formated fashion_quries_high_quality.txt.v2.uniq fashion_qurieshigh_quality.from_tags fashion_quries_high_quality.txt.v2.uniq.trans
# Build a term-frequency table from every file matching *rmatted.txt in the
# current directory:
#   1. keep only lines that contain ", "
#   2. split those lines on ", " so each term sits on its own line
#      (uses \n in the sed replacement, which GNU sed turns into a newline)
#   3. lowercase everything, then count identical terms
#   4. order by count, largest first, and keep terms whose count exceeds 2
# Result lines have the form "<count> <term>" and go to lowercase_counts.txt.
cat *rmatted.txt \
  | grep ', ' \
  | sed 's/, /\n/g' \
  | tr '[:upper:]' '[:lower:]' \
  | sort \
  | uniq -c \
  | sort -k1rn \
  | awk '$1 > 2' \
  > lowercase_counts.txt
# Strip the leading count column from the counts file and write the remaining
# terms, one per line, to lowercase_words.txt in the current directory.
# The input defaults to the dataset path below; set COUNTS_FILE to read a
# different counts file.
counts_file=${COUNTS_FILE:-/data/tw/SearchEngine/docs/dataset/lowercase_counts.txt}
# awk blanks field 1 (the count), sed trims the leading space that leaves
# behind, and grep drops lines that end up empty.
# NOTE: the grep pattern must be '^$' (an empty line). A bare '$' matches the
# end of every line, so with -v it would discard all input and produce an
# empty output file.
awk '{ $1 = ""; print }' "$counts_file" \
  | sed 's/^ *//' \
  | grep -v '^$' \
  > lowercase_words.txt