ch8: fix code format

chai2010
2016-01-21 10:39:06 +08:00
parent 0b5ec941ed
commit b1730821fe
13 changed files with 393 additions and 404 deletions

In Section 5.6 we built a simple web crawler that explored an entire web site using a BFS (breadth-first search) algorithm. In this section we'll make that crawler concurrent, so that independent fetches can perform their I/O in parallel and make maximal use of the network. The crawl function is the same as the one in gopl.io/ch5/findlinks3.
<u><i>gopl.io/ch8/crawl1</i></u>
```go
func crawl(url string) []string {
	fmt.Println(url)
	list, err := links.Extract(url)
	if err != nil {
		log.Print(err)
	}
	return list
}
```
The main function resembles the breadth-first traversal of Section 5.6, except that here the worklist is a channel rather than a slice, and each item in it is a list of URLs to crawl.
```go
func main() {
	worklist := make(chan []string)

	// Start with the command-line arguments.
	go func() { worklist <- os.Args[1:] }()

	// Crawl the web concurrently.
	seen := make(map[string]bool)
	for list := range worklist {
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}
```
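Notice that the crawl goroutine takes link as an explicit parameter, which avoids the loop-variable capture problem first seen in Section 5.6.1: before Go 1.22 changed loop-variable scoping, every goroutine launched in a loop shared the same variable. A minimal sketch of the bug, with illustrative values (not from the book):

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	links := []string{"a", "b", "c"}
	for _, link := range links {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Before Go 1.22, link was shared by all iterations, so
			// this often printed the final element three times.
			// Passing link as a parameter, as crawl1 does, avoids this.
			fmt.Println(link)
		}()
	}
	wg.Wait()
}
```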
When run, the crawler prints the URLs it discovers, but it soon starts failing with errors like these:
```
https://golang.org/doc/
https://golang.org/blog/
...
2015/07/15 18:22:12 Get ...: dial tcp: lookup blog.golang.org: no such host
2015/07/15 18:22:12 Get ...: dial tcp 23.21.222.120:443: socket: too many open files
...
```
The program is too parallel: it attempts so many network connections at once that it exceeds the per-process limit on open files, causing operations such as DNS lookups and calls to net.Dial to fail. We can limit parallelism using a buffered channel of capacity 20 as a counting semaphore.
Let's rewrite crawl so that the call to links.Extract is bracketed by operations to acquire and release a token, ensuring that at most 20 calls to it are active at any one time. It's good practice to keep the semaphore operations as close as possible to the I/O operation they regulate.
<u><i>gopl.io/ch8/crawl2</i></u>
```go
// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)

func crawl(url string) []string {
	fmt.Println(url)
	tokens <- struct{}{} // acquire a token
	list, err := links.Extract(url)
	<-tokens // release the token
	if err != nil {
		log.Print(err)
	}
	return list
}
```
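The buffered channel used as tokens is a general-purpose idiom, not specific to crawling. Here is a minimal self-contained sketch of the same acquire/release pattern (the names sem and fetch are illustrative, not from the book), capping ten simulated operations at three in flight:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// sem is a counting semaphore with capacity 3: at most three
// goroutines may hold a token at the same time.
var sem = make(chan struct{}, 3)

// fetch simulates a rate-limited I/O operation.
func fetch(id int) {
	sem <- struct{}{}        // acquire a token; blocks while 3 are held
	defer func() { <-sem }() // release the token on return
	fmt.Println("fetching", id)
	time.Sleep(100 * time.Millisecond) // stand-in for real I/O
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			fetch(i)
		}(i)
	}
	wg.Wait()
}
```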
The second problem is that the program never terminates, even when it has discovered all the links reachable from the initial URLs. (Of course, you're unlikely to notice this problem unless you chose the initial URLs carefully or implemented the depth limiting of Exercise 8.6.) For the program to terminate, we need to break out of the main loop when the worklist is empty and no crawl goroutines are active.
```go
func main() {
	worklist := make(chan []string)
	var n int // number of pending sends to worklist

	// Start with the command-line arguments.
	n++
	go func() { worklist <- os.Args[1:] }()

	// Crawl the web concurrently.
	seen := make(map[string]bool)
	for ; n > 0; n-- {
		list := <-worklist
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				n++
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}
```
In this version, the counter n keeps track of the number of sends to the worklist that are yet to occur. Each time we know an item needs to be sent to the worklist, we increment n: once before sending the initial command-line arguments, and again each time we start a crawler goroutine. The main loop terminates when n falls to zero, at which point there is no more work to be done.
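An alternative way to detect termination, not used in this chapter's code but common in Go programs, is to count pending sends with a sync.WaitGroup and let a helper goroutine close the worklist once the count drains; the main loop can then simply range over the channel. A sketch under that assumption, with crawl stubbed out:

```go
package main

import (
	"fmt"
	"os"
	"sync"
)

// crawl is stubbed here; assume it fetches url and returns the
// links found on the page, as in the versions above.
func crawl(url string) []string {
	fmt.Println(url)
	return nil
}

func main() {
	worklist := make(chan []string)
	var wg sync.WaitGroup // counts pending sends to worklist

	wg.Add(1)
	go func() {
		defer wg.Done()
		worklist <- os.Args[1:] // start with the command-line arguments
	}()

	// Close worklist once every pending send has completed,
	// so that the range loop below can terminate.
	go func() {
		wg.Wait()
		close(worklist)
	}()

	seen := make(map[string]bool)
	for list := range worklist {
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				wg.Add(1)
				go func(link string) {
					defer wg.Done()
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}
```

Note that each wg.Add(1) happens before the corresponding goroutine is spawned, so the closing goroutine can never observe a zero count while sends are still pending.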