Let the code speak for itself.
Java 8
/**
 * Prints the 20 most frequent long words of {@code samples.txt}.
 *
 * <p>Every line is split on single non-word characters; tokens of six characters or fewer are
 * dropped, the remaining ones are upper-cased and counted in parallel.</p>
 */
public class Application {

    /**
     * Counts the words and writes up to 20 {@code WORD=count} entries to standard output,
     * most frequent first.
     *
     * @param args unused
     * @throws IOException if {@code samples.txt} cannot be opened
     */
    public static void main(String[] args) throws IOException {
        Map<String, Integer> wordCounts;
        // Files.lines keeps the underlying file open until the stream is closed,
        // so the stream is scoped with try-with-resources.
        try (java.util.stream.Stream<String> lines = Files.lines(new File("samples.txt").toPath())) {
            wordCounts = lines
                    .parallel()
                    .flatMap(Pattern.compile("\\W")::splitAsStream)
                    // Also discards the empty tokens produced by consecutive separators.
                    .filter(word -> word.length() > 6)
                    .map(String::toUpperCase)
                    // Concurrent map + Integer::sum merges counts safely across worker threads.
                    .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
        }

        wordCounts.entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
                .limit(20)
                .forEach(System.out::println);
    }
}
Golang
// WordOccourences pairs a word with the number of times it was counted.
type WordOccourences struct {
	word         string // the counted word
	occourcenres int    // how often the word was seen
}
// main prints the most frequent words of samples.txt as WORD=count lines.
//
// Every whitespace-separated token is stripped of all characters outside
// a-z, A-Z and the German umlauts/ß, upper-cased, and counted when more than
// five characters remain. Words counted more than 20 times are printed in
// descending order of frequency, at most 20 of them.
func main() {
	expression := regexp.MustCompile("[^a-zA-ZüäößÜÄÖ]")

	data, err := os.Open("samples.txt")
	if err != nil {
		// Report the failure instead of silently printing nothing.
		fmt.Fprintln(os.Stderr, err)
		return
	}
	// Deferred only after a successful open, so data is never nil here.
	defer data.Close()

	wordCounts := make(map[string]int)
	scanner := bufio.NewScanner(data)
	scanner.Split(bufio.ScanWords)
	for scanner.Scan() {
		sanitized := expression.ReplaceAllString(scanner.Text(), "")
		// Measure the cleaned word in runes: stripped punctuation and
		// multi-byte umlauts must not inflate the length.
		if len([]rune(sanitized)) > 5 {
			wordCounts[strings.ToUpper(sanitized)]++
		}
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Collect only the qualifying words; capacity is an upper bound.
	wordlist := make([]WordOccourences, 0, len(wordCounts))
	for word, count := range wordCounts {
		if count > 20 {
			wordlist = append(wordlist, WordOccourences{word, count})
		}
	}

	// Most frequent first; ties are broken alphabetically so that the output
	// does not depend on map iteration order.
	sort.Slice(wordlist, func(i, j int) bool {
		if wordlist[i].occourcenres != wordlist[j].occourcenres {
			return wordlist[i].occourcenres > wordlist[j].occourcenres
		}
		return wordlist[i].word < wordlist[j].word
	})

	// Strictly fewer than 20 iterations: the previous "<= 20" printed 21 rows.
	for idx := 0; idx < len(wordlist) && idx < 20; idx++ {
		fmt.Printf("%s=%d\n", wordlist[idx].word, wordlist[idx].occourcenres)
	}
}
Scala
/** Counts how often long words occur in a text file. */
class CountingWords {

  /** Returns the most frequent words of a file.
    *
    * Lines are split on runs of non-word characters; tokens of six characters
    * or fewer are dropped and the rest are upper-cased before counting.
    *
    * @param filename path of the text file to read
    * @return up to 20 `(WORD, count)` pairs, most frequent first
    */
  def countFile(filename: String): List[(String, Int)] = {
    val source = scala.io.Source.fromFile(filename)
    try {
      // The whole pipeline is evaluated inside the try block, so the lazy
      // line iterator is fully consumed before the source is closed.
      source.getLines()
        .flatMap(_.split("\\W+"))
        .filter(_.length > 6)
        .map(_.toUpperCase)
        .foldLeft(Map.empty[String, Int]) { (counts, word) =>
          counts + (word -> (counts.getOrElse(word, 0) + 1))
        }
        .toList
        .sortBy(-_._2)
        .take(20)
    } finally {
      // Source holds an open file handle; release it even when reading fails.
      source.close()
    }
  }
}
R
# Word-frequency count for samples.txt.
# Read whitespace-separated tokens as character data. quote = "" switches off
# scan()'s default quote handling, which would otherwise treat apostrophes and
# double quotes in running text as field delimiters and glue words together.
words <- scan("samples.txt", what = character(), quote = "")
# Strip every non-letter character, then upper-case.
words <- toupper(gsub("[^[:alpha:]]", "", words))
# Keep only words with more than five characters.
words <- words[nchar(words) > 5]
# Tabulate the occurrences per word.
words <- table(words)
# Order by descending frequency.
occourences <- sort(words, decreasing = TRUE)
# Top-level expression: prints the words that occur more than ten times.
occourences[occourences > 10]
C++
namespace wordfrequency {

    /** Maps a word to the number of times it was counted. */
    typedef std::unordered_map<std::string, int> wordFrequencyMap;
    /** A single (word, count) entry. */
    typedef std::pair<std::string, int> wordFrequencyPair;
    /** Sequence of (word, count) entries, as handed out by WordCounter::get(). */
    typedef std::vector<wordFrequencyPair> wordFrequencyVector;

    /**
     * Counts word frequencies in a text file.
     *
     * The configuration and sorting methods return a reference to the counter
     * so that calls can be chained.
     */
    class WordCounter {
    private:
        std::string fileName;            // path of the file read by count()
        wordFrequencyMap data;           // word -> number of occurrences
        wordFrequencyVector exportData;  // entries selected for get()
        int ignoreLessThan = 0;          // minimum count for an entry to be exported
        int minimumWordLength = 0;       // minimum length of a cleaned word

    public:
        WordCounter();
        /** Creates a counter for the file at fileName. */
        WordCounter(const std::string& fileName);
        WordCounter(const WordCounter& orig);
        virtual ~WordCounter();

        /** Reads the file, counts its words and collects the entries to export. */
        WordCounter& count();
        /** Sets the minimum count an entry needs in order to be exported. */
        WordCounter& ignoreCountsLessThan(int numOfWords);
        /** Sorts the exported entries by ascending count. */
        WordCounter& sortAscending();
        /** Sorts the exported entries by descending count. */
        WordCounter& sortDescending();
        /** Sets the minimum length a cleaned word needs in order to be counted. */
        WordCounter& ignoreWordsShorterThanChars(int numOfChars);
        /** Returns a copy of the exported entries. */
        const wordFrequencyVector get();
    };  // the class definition must be terminated with a semicolon

}  // namespace wordfrequency
namespace wordfrequency {

    /** Creates a counter without a file name; count() will then find nothing to read. */
    WordCounter::WordCounter() {
    }

    /** Copies the file name, the tallies, the export list and both thresholds. */
    WordCounter::WordCounter(const WordCounter& orig)
        : fileName(orig.fileName),
          data(orig.data),
          exportData(orig.exportData),
          ignoreLessThan(orig.ignoreLessThan),
          minimumWordLength(orig.minimumWordLength) {
    }

    WordCounter::~WordCounter() {
    }

    /** Creates a counter for the file at fileName. */
    WordCounter::WordCounter(const std::string& fileName) {
        this->fileName.assign(fileName);
    }

    /**
     * Reads the file word by word, strips every character that is not a kept
     * letter, upper-cases the remainder and tallies it. Entries whose count
     * reaches the configured minimum are collected for get().
     *
     * A file that cannot be opened yields an empty result.
     *
     * @return this counter, for chaining
     */
    WordCounter& WordCounter::count() {
        // Start from a clean slate so that a second call neither doubles the
        // tallies nor appends duplicate entries to the export list.
        this->data.clear();
        this->exportData.clear();

        // Matches every character that is NOT one of the letters to keep.
        // NOTE(review): the umlauts presumably only match as intended when source
        // and input share a single-byte encoding — confirm for UTF-8 input.
        boost::regex printableChars("[^a-zA-ZüäößÜÄÖ]");

        // A non-positive threshold means "no minimum"; converting it up front avoids
        // comparing a negative int against an unsigned size.
        const std::string::size_type minLength = this->minimumWordLength > 0
                ? static_cast<std::string::size_type>(this->minimumWordLength)
                : 0;

        std::ifstream file(this->fileName);
        std::string word;
        while (file >> word) {
            std::string wordCleaned = boost::regex_replace(word, printableChars, "");
            // Tokens made up solely of stripped characters must not be counted as "".
            if (!wordCleaned.empty() && wordCleaned.size() >= minLength) {
                boost::to_upper(wordCleaned);
                this->data[wordCleaned]++;
            }
        }

        for (const auto& entry : this->data) {
            if (entry.second >= this->ignoreLessThan) {
                this->exportData.emplace_back(entry.first, entry.second);
            }
        }
        return *this;
    }

    /**
     * Sets the minimum count an entry needs in order to be exported by count().
     *
     * @return this counter, for chaining
     */
    WordCounter& WordCounter::ignoreCountsLessThan(int numOfWords) {
        this->ignoreLessThan = numOfWords;
        return *this;
    }

    /**
     * Sorts the exported entries by ascending count.
     *
     * @return this counter, for chaining
     */
    WordCounter& WordCounter::sortAscending() {
        std::sort(this->exportData.begin(), this->exportData.end(),
                [](const wordFrequencyPair& a, const wordFrequencyPair& b) {
                    return a.second < b.second;
                });
        return *this;
    }

    /**
     * Sorts the exported entries by descending count.
     *
     * @return this counter, for chaining
     */
    WordCounter& WordCounter::sortDescending() {
        std::sort(this->exportData.begin(), this->exportData.end(),
                [](const wordFrequencyPair& a, const wordFrequencyPair& b) {
                    return a.second > b.second;
                });
        return *this;
    }

    /**
     * Sets the minimum length a cleaned word needs in order to be counted.
     *
     * @return this counter, for chaining
     */
    WordCounter& WordCounter::ignoreWordsShorterThanChars(int numOfChars) {
        this->minimumWordLength = numOfChars;
        // Flowing off the end of a non-void function is undefined behaviour,
        // and callers chain further calls onto this result.
        return *this;
    }

    /** Returns a copy of the entries collected by the last count(). */
    const wordFrequencyVector WordCounter::get() {
        return this->exportData;
    }
}
int main(int argc, char** argv) {
WordCounter wordCounter("sample.txt");
wordFrequencyVector wfm = wordCounter
.ignoreCountsLessThan(100)
.ignoreWordsShorterThanChars(6)
.count()
.sortDescending()
.get();
for (wordFrequencyVector::iterator it = wfm.begin(); it != wfm.end(); ++it) {
std::cout << ' ' << it->first << " " << it->second << std::endl;
}
return 0;
}
3 Minutes