mirror of
https://github.com/gopl-zh/gopl-zh.github.com.git
synced 2024-11-28 17:19:06 +00:00
62 lines
1.2 KiB
Go
62 lines
1.2 KiB
Go
// Copyright © 2016 Alan A. A. Donovan & Brian W. Kernighan.
|
|
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
|
|
|
|
// See page 243.
|
|
|
|
// Crawl3 crawls web links starting with the command-line arguments.
|
|
//
|
|
// This version uses bounded parallelism.
|
|
// For simplicity, it does not address the termination problem.
|
|
//
|
|
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
|
|
"gopl.io/ch5/links"
|
|
)
|
|
|
|
func crawl(url string) []string {
|
|
fmt.Println(url)
|
|
list, err := links.Extract(url)
|
|
if err != nil {
|
|
log.Print(err)
|
|
}
|
|
return list
|
|
}
|
|
|
|
//!+
|
|
func main() {
|
|
worklist := make(chan []string) // lists of URLs, may have duplicates
|
|
unseenLinks := make(chan string) // de-duplicated URLs
|
|
|
|
// Add command-line arguments to worklist.
|
|
go func() { worklist <- os.Args[1:] }()
|
|
|
|
// Create 20 crawler goroutines to fetch each unseen link.
|
|
for i := 0; i < 20; i++ {
|
|
go func() {
|
|
for link := range unseenLinks {
|
|
foundLinks := crawl(link)
|
|
go func() { worklist <- foundLinks }()
|
|
}
|
|
}()
|
|
}
|
|
|
|
// The main goroutine de-duplicates worklist items
|
|
// and sends the unseen ones to the crawlers.
|
|
seen := make(map[string]bool)
|
|
for list := range worklist {
|
|
for _, link := range list {
|
|
if !seen[link] {
|
|
seen[link] = true
|
|
unseenLinks <- link
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//!-
|