mirror of
https://github.com/gopl-zh/gopl-zh.github.com.git
synced 2025-08-16 03:21:33 +00:00
ch8: fix code format
ch8/ch8-06.md
@@ -2,15 +2,15 @@
In Section 5.6, we built a simple web crawler that explored the link graph of a website using breadth-first search (BFS). In this section we'll make that crawler concurrent, so that independent fetches can perform their I/O in parallel and make the most of the network. The crawl function is the same as the one in gopl.io/ch5/findlinks3.
<u><i>gopl.io/ch8/crawl1</i></u>
```go
func crawl(url string) []string {
	fmt.Println(url)
	list, err := links.Extract(url)
	if err != nil {
		log.Print(err)
	}
	return list
}
```
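For readers without chapter 5 at hand, here is a minimal sketch of what links.Extract does, assuming only the behaviour described there; the real version in gopl.io/ch5/links also resolves relative URLs against the page's base URL, which this sketch omits:

```go
package links

import (
	"fmt"
	"net/http"

	"golang.org/x/net/html"
)

// Extract performs an HTTP GET on url, parses the response as HTML,
// and returns the href attribute of every <a> element it finds.
func Extract(url string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
	}

	var links []string
	var visit func(n *html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					links = append(links, a.Val)
				}
			}
		}
		// Recurse into the element's children.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(doc)
	return links, nil
}
```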
@@ -18,23 +18,23 @@ func crawl(url string) []string {
```go
func main() {
	worklist := make(chan []string)

	// Start with the command-line arguments.
	go func() { worklist <- os.Args[1:] }()

	// Crawl the web concurrently.
	seen := make(map[string]bool)
	for list := range worklist {
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}
```
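To try it, one would build the program and pass one or more seed URLs on the command line; the exact invocation below is illustrative, not taken from this commit:

```
$ go build gopl.io/ch8/crawl1
$ ./crawl1 https://golang.org
```

Because every discovered link immediately spawns a goroutine, the program has unbounded parallelism and soon exhausts file descriptors, as the log in the next hunk shows.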
@@ -52,8 +52,7 @@ https://golang.org/doc/
```
https://golang.org/blog/
...
2015/07/15 18:22:12 Get ...: dial tcp: lookup blog.golang.org: no such host
2015/07/15 18:22:12 Get ...: dial tcp 23.21.222.120:443: socket: too many open files
...
```
@@ -65,54 +64,51 @@ https://golang.org/blog/
Let's rewrite crawl so that the call to links.Extract is bracketed by acquiring and releasing a token, ensuring that at most 20 calls to it are active at any one time. It is good practice to keep the semaphore count close to the number of I/O resources it governs.
<u><i>gopl.io/ch8/crawl2</i></u>
```go
// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)

func crawl(url string) []string {
	fmt.Println(url)
	tokens <- struct{}{} // acquire a token
	list, err := links.Extract(url)
	<-tokens // release the token
	if err != nil {
		log.Print(err)
	}
	return list
}
```
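The buffered channel here is the classic Go counting-semaphore idiom: a send fills one of the 20 slots (acquire) and a receive vacates one (release). The following self-contained demo, with illustrative names not from the book, shows the same idiom limiting a batch of workers to three at a time:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// sema has capacity 3, so at most 3 goroutines may hold a token at once.
var sema = make(chan struct{}, 3)

func worker(id int, wg *sync.WaitGroup) {
	defer wg.Done()
	sema <- struct{}{}        // acquire a token; blocks if 3 are held
	defer func() { <-sema }() // release the token on return
	fmt.Println("worker", id, "running")
	time.Sleep(100 * time.Millisecond) // simulate some I/O
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go worker(i, &wg)
	}
	wg.Wait()
}
```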
The second problem is that the program never terminates, even once it has crawled every link reachable from the initial ones. (You probably haven't noticed this yet, unless you chose the initial URLs carefully or implemented the depth limit of Exercise 8.6.) For the program to terminate, the main loop must exit when the worklist is empty and no crawl goroutines are running.
```go
func main() {
	worklist := make(chan []string)
	var n int // number of pending sends to worklist

	// Start with the command-line arguments.
	n++
	go func() { worklist <- os.Args[1:] }()

	// Crawl the web concurrently.
	seen := make(map[string]bool)
	for ; n > 0; n-- {
		list := <-worklist
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				n++
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}
```
In this version, the counter n tracks the number of sends to the worklist that are still pending. Each time we find items that must be sent to the worklist, we increment n: once before sending the initial command-line arguments, and again before starting each crawler goroutine. The main loop exits when n falls to zero, since at that point there is no work left to do.
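One way to convince yourself that the loop really drains: replace crawl with a stub over a small in-memory link graph (a hypothetical test harness, not part of the book's code) and watch n reach zero without any network access:

```go
package main

import "fmt"

// graph is a tiny fixed link graph standing in for the web.
var graph = map[string][]string{
	"a": {"b", "c"},
	"b": {"c"},
	"c": {},
}

// crawl is a stub: it "fetches" a page by looking up its links.
func crawl(url string) []string {
	fmt.Println(url)
	return graph[url]
}

func main() {
	worklist := make(chan []string)
	var n int // number of pending sends to worklist

	// Seed the worklist, exactly as the real program does with os.Args.
	n++
	go func() { worklist <- []string{"a"} }()

	seen := make(map[string]bool)
	for ; n > 0; n-- {
		list := <-worklist
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				n++
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
	fmt.Println("done: crawled", len(seen), "pages") // loop exited with n == 0
}
```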