# The final result is three files:
#   queries.txt.formated
#   fashion_quries__high_quality.txt.v2.uniq
#   fashion_quries__high_quality.from_tags
#   fashion_quries__high_quality.txt.v2.uniq.trans
# NOTE(review): the note says "three files" but four names are listed —
# confirm which of them are the actual final outputs.

# Step 1: build a frequency table of lowercased, comma-separated items.
#   grep   keep only lines that contain ", "
#   sed    put every ", "-separated item on its own line (relies on sed
#          treating \n in the replacement as a newline, as GNU sed does)
#   tr     lowercase everything
#   sort | uniq -c   count identical items
#   sort -k1rn       order by count, highest first
#   awk    keep only items whose count is greater than 2
# The result ("<count> <item>" per line) goes to lowercase_counts.txt in the
# current directory.
cat *rmatted.txt \
  | grep ", " \
  | sed 's/, /\n/g' \
  | tr '[:upper:]' '[:lower:]' \
  | sort \
  | uniq -c \
  | sort -k1rn \
  | awk '$1 > 2 {print}' \
  > lowercase_counts.txt

# Step 2: drop the count column and keep only the items themselves.
#   awk    blank the first field (the count); assigning to $1 makes awk rebuild
#          the record, so runs of whitespace inside the item become one space
#   sed    strip the leading spaces left behind by the blanked field
#   grep   discard lines that ended up empty
# NOTE(review): this reads lowercase_counts.txt via an absolute path while
# step 1 writes it relative to the current directory — presumably both steps
# are run from /data/tw/SearchEngine/docs/dataset; confirm.
awk '{$1=""; print $0}' /data/tw/SearchEngine/docs/dataset/lowercase_counts.txt \
  | sed 's/^ *//' \
  | grep -v '^$' \
  > lowercase_words.txt