From 5b14a3b8de23f6fde9edb206fad9fe0f9c1ebd62 Mon Sep 17 00:00:00 2001 From: Xargin Date: Fri, 11 Dec 2015 15:02:03 +0800 Subject: [PATCH 1/2] Revert "Revert "8.8 done"" This reverts commit ee87a6dc8b2801119a36ae2aad0ef1fa2c04ce66. --- ch8/ch8-08.md | 184 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 183 insertions(+), 1 deletion(-) diff --git a/ch8/ch8-08.md b/ch8/ch8-08.md index c72718e..b81cbf9 100644 --- a/ch8/ch8-08.md +++ b/ch8/ch8-08.md @@ -1,3 +1,185 @@ ## 8.8. 示例: 併髮的字典遍歷 -TODO +In this section, we’ll build a program that reports the disk usage of one or more directories specified on the command line, like the Unix du command. Most of its work is done by the walkDir function below, which enumerates the entries of the directory dir using the dirents helper function. + +```go +gopl.io/ch8/du1 +// walkDir recursively walks the file tree rooted at dir +// and sends the size of each found file on fileSizes. +func walkDir(dir string, fileSizes chan<- int64) { + for _, entry := range dirents(dir) { + if entry.IsDir() { + subdir := filepath.Join(dir, entry.Name()) + walkDir(subdir, fileSizes) + } else { + fileSizes <- entry.Size() + } + } +} + +// dirents returns the entries of directory dir. +func dirents(dir string) []os.FileInfo { + entries, err := ioutil.ReadDir(dir) + if err != nil { + fmt.Fprintf(os.Stderr, "du1: %v\n", err) + return nil + } + return entries +} +``` + +he ioutil.ReadDir function returns a slice of os.FileInfo—the same information that a call to os.Stat returns for a single file. For each subdirectory, walkDir recursively calls itself, and for each file, walkDir sends a message on the fileSizes channel. The message is the size of the file in bytes. + +The main function, shown below, uses two goroutines. The background goroutine calls walkDir for each directory specified on the command line and finally closes the fileSizes channel. 
The main goroutine computes the sum of the file sizes it receives from the channel and finally prints the total. + + +```go +package main + +import ( + "flag" + "fmt" + "io/ioutil" + "os" + "path/filepath" +) + +func main() { + // Determine the initial directories. + flag.Parse() + roots := flag.Args() + if len(roots) == 0 { + roots = []string{"."} + } + + // Traverse the file tree. + fileSizes := make(chan int64) + go func() { + for _, root := range roots { + walkDir(root, fileSizes) + } + close(fileSizes) + }() + + // Print the results. + var nfiles, nbytes int64 + for size := range fileSizes { + nfiles++ + nbytes += size + } + printDiskUsage(nfiles, nbytes) +} + +func printDiskUsage(nfiles, nbytes int64) { + fmt.Printf("%d files %.1f GB\n", nfiles, float64(nbytes)/1e9) +} + +``` +This program pauses for a long while before printing its result: +``` +$ go build gopl.io/ch8/du1 +$ ./du1 $HOME /usr /bin /etc +213201 files 62.7 GB +``` + +The program would be nicer if it kept us informed of its progress. However, simply moving the printDiskUsage call into the loop would cause it to print thousands of lines of output. +The variant of du below prints the totals periodically, but only if the -v flag is specified since not all users will want to see progress messages. The background goroutine that loops over roots remains unchanged. The main goroutine now uses a ticker to generate events every 500ms, and a select statement to wait for either a file size message, in which case it updates the totals, or a tick event, in which case it prints the current totals. If the -v flag is not specified, the tick channel remains nil, and its case in the select is effectively disabled. + +```go +gopl.io/ch8/du2 +var verbose = flag.Bool("v", false, "show verbose progress messages") + +func main() { + // ...start background goroutine... + + // Print the results periodically. 
+ var tick <-chan time.Time + if *verbose { + tick = time.Tick(500 * time.Millisecond) + } + var nfiles, nbytes int64 +loop: + for { + select { + case size, ok := <-fileSizes: + if !ok { + break loop // fileSizes was closed + } + nfiles++ + nbytes += size + case <-tick: + printDiskUsage(nfiles, nbytes) + } + } + printDiskUsage(nfiles, nbytes) // final totals +} +``` +Since the program no longer uses a range loop, the first select case must explicitly test whether the fileSizes channel has been closed, using the two-result form of receive opera- tion. If the channel has been closed, the program breaks out of the loop. The labeled break statement breaks out of both the select and the for loop; an unlabeled break would break out of only the select, causing the loop to begin the next iteration. + +The program now gives us a leisurely stream of updates: + + +$ go build gopl.io/ch8/du2 +$ ./du2 -v $HOME /usr /bin /etc +28608 files 8.3 GB +54147 files 10.3 GB +93591 files 15.1 GB +127169 files 52.9 GB +175931 files 62.2 GB +213201 files 62.7 GB + +However, it still takes too long to finish. There’s no reason why all the calls to walkDir can’t be done concurrently, thereby exploiting parallelism in the disk system. The third version of du, below, creates a new goroutine for each call to walkDir. It uses a sync.WaitGroup (§8.5) to count the number of calls to walkDir that are still active, and a closer goroutine to close the fileSizes channel when the counter drops to zero. + + +```go +gopl.io/ch8/du3 +func main() { + // ...determine roots... + // Traverse each root of the file tree in parallel. + fileSizes := make(chan int64) + var n sync.WaitGroup + for _, root := range roots { + n.Add(1) + go walkDir(root, &n, fileSizes) + } + go func() { + n.Wait() + close(fileSizes) + }() + // ...select loop... 
+} + +func walkDir(dir string, n *sync.WaitGroup, fileSizes chan<- int64) { + defer n.Done() + for _, entry := range dirents(dir) { + if entry.IsDir() { + n.Add(1) + subdir := filepath.Join(dir, entry.Name()) + go walkDir(subdir, n, fileSizes) + } else { + fileSizes <- entry.Size() + } + } +} +``` + +Since this program creates many thousands of goroutines at its peak, we have to change dirents to use a counting semaphore to prevent it from opening too many files at once, just as we did for the web crawler in Section 8.6: + + +```go +// sema is a counting semaphore for limiting concurrency in dirents. +var sema = make(chan struct{}, 20) + +// dirents returns the entries of directory dir. +func dirents(dir string) []os.FileInfo { + sema <- struct{}{} // acquire token + defer func() { <-sema }() // release token + // ... + +``` + +This version runs several times faster than the previous one, though there is a lot of variability from system to system. + +Exercise 8.9: Write a version of du that computes and periodically displays separate totals for each of the root directories. + + From 24a99ea7202e3583a57fb3f1003c000394c10355 Mon Sep 17 00:00:00 2001 From: Xargin Date: Fri, 11 Dec 2015 15:48:07 +0800 Subject: [PATCH 2/2] 8.8 done --- ch8/ch8-08.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/ch8/ch8-08.md b/ch8/ch8-08.md index b81cbf9..d43d914 100644 --- a/ch8/ch8-08.md +++ b/ch8/ch8-08.md @@ -1,6 +1,6 @@ ## 8.8. 示例: 併髮的字典遍歷 -In this section, we’ll build a program that reports the disk usage of one or more directories specified on the command line, like the Unix du command. Most of its work is done by the walkDir function below, which enumerates the entries of the directory dir using the dirents helper function. 
+在本小节中,我们会创建一个程序来生成指定目录的硬盘使用情况报告,这个程序和Unix里的du工具比较相似。大多数工作用下面这个walkDir函数来完成,这个函数使用dirents函数来枚举一个目录下的所有条目。 ```go gopl.io/ch8/du1 @@ -28,9 +28,9 @@ func dirents(dir string) []os.FileInfo { } ``` -he ioutil.ReadDir function returns a slice of os.FileInfo—the same information that a call to os.Stat returns for a single file. For each subdirectory, walkDir recursively calls itself, and for each file, walkDir sends a message on the fileSizes channel. The message is the size of the file in bytes. +ioutil.ReadDir函数会返回一个os.FileInfo类型的slice,os.FileInfo类型也是os.Stat这个函数对单个文件的返回值。对每一个子目录而言,walkDir会递归地调用其自身;而对每一个文件,walkDir会向fileSizes这个channel发送一条消息。这条消息就是该文件的字节大小。 -The main function, shown below, uses two goroutines. The background goroutine calls walkDir for each directory specified on the command line and finally closes the fileSizes channel. The main goroutine computes the sum of the file sizes it receives from the channel and finally prints the total. +下面的主函数,用了两个goroutine。后台的goroutine调用walkDir来遍历命令行给出的每一个路径并最终关闭fileSizes这个channel。主goroutine会对其从channel中接收到的文件大小进行累加,并输出其和。 ```go @@ -75,15 +75,16 @@ func printDiskUsage(nfiles, nbytes int64) { } ``` -This program pauses for a long while before printing its result: +这个程序会在打印其结果之前卡住很长时间: ``` $ go build gopl.io/ch8/du1 $ ./du1 $HOME /usr /bin /etc 213201 files 62.7 GB ``` -The program would be nicer if it kept us informed of its progress. However, simply moving the printDiskUsage call into the loop would cause it to print thousands of lines of output. -The variant of du below prints the totals periodically, but only if the -v flag is specified since not all users will want to see progress messages. The background goroutine that loops over roots remains unchanged. The main goroutine now uses a ticker to generate events every 500ms, and a select statement to wait for either a file size message, in which case it updates the totals, or a tick event, in which case it prints the current totals. 
If the -v flag is not specified, the tick channel remains nil, and its case in the select is effectively disabled. +如果在运行的时候能够让我们知道处理进度的话想必更好。但是,如果简单地把printDiskUsage函数调用移动到循环里会导致其打印出成千上万行输出。 + +下面这个du的变种会间歇打印内容,不过只有在调用时提供了-v的flag才会显示程序进度信息。在roots目录上循环的后台goroutine在这里保持不变。主goroutine现在使用了计时器来每500ms生成事件,然后用select语句来等待文件大小的消息来更新总大小数据,或者一个计时器的事件来打印当前的总大小数据。如果-v的flag在运行时没有传入的话,tick这个channel会保持为nil,这样在select里的case也就相当于被禁用了。 ```go gopl.io/ch8/du2 @@ -114,11 +115,11 @@ loop: printDiskUsage(nfiles, nbytes) // final totals } ``` -Since the program no longer uses a range loop, the first select case must explicitly test whether the fileSizes channel has been closed, using the two-result form of receive opera- tion. If the channel has been closed, the program breaks out of the loop. The labeled break statement breaks out of both the select and the for loop; an unlabeled break would break out of only the select, causing the loop to begin the next iteration. - -The program now gives us a leisurely stream of updates: +由于我们的程序不再使用range循环,第一个select的case必须显式地判断fileSizes的channel是不是已经被关闭了,这里可以用到channel接收的二值形式。如果channel已经被关闭了的话,程序会直接退出循环。这里的break语句带有标签,这样可以同时跳出select和for循环;如果使用不带标签的break,则只会跳出select语句,而外层的for循环会直接进入下一轮迭代。 +现在程序会悠闲地为我们打印更新流: +``` $ go build gopl.io/ch8/du2 $ ./du2 -v $HOME /usr /bin /etc 28608 files 8.3 GB @@ -127,9 +128,9 @@ $ ./du2 -v $HOME /usr /bin /etc 127169 files 52.9 GB 175931 files 62.2 GB 213201 files 62.7 GB +``` -However, it still takes too long to finish. There’s no reason why all the calls to walkDir can’t be done concurrently, thereby exploiting parallelism in the disk system. The third version of du, below, creates a new goroutine for each call to walkDir. It uses a sync.WaitGroup (§8.5) to count the number of calls to walkDir that are still active, and a closer goroutine to close the fileSizes channel when the counter drops to zero. 
+然而这个程序还是会花上很长时间才会结束。其实所有对walkDir的调用完全可以并发地执行,从而充分利用磁盘系统的并行能力。下面这个第三个版本的du,会对每一个walkDir的调用创建一个新的goroutine。它使用sync.WaitGroup (§8.5)来对仍旧活跃的walkDir调用进行计数,另一个goroutine会在计数器减为零的时候将fileSizes这个channel关闭。 ```go gopl.io/ch8/du3 @@ -163,7 +164,7 @@ func walkDir(dir string, n *sync.WaitGroup, fileSizes chan<- int64) { } ``` -Since this program creates many thousands of goroutines at its peak, we have to change dirents to use a counting semaphore to prevent it from opening too many files at once, just as we did for the web crawler in Section 8.6: +由于这个程序在高峰期会创建成千上万的goroutine,我们需要修改dirents函数,用计数信号量来阻止它同时打开太多的文件,就像我们在8.6节中的并发爬虫一样: ```go @@ -178,8 +179,8 @@ func dirents(dir string) []os.FileInfo { ``` -This version runs several times faster than the previous one, though there is a lot of variability from system to system. +这个版本比之前那个快了好几倍,尽管其具体效率还是和你的运行环境,机器配置相关。 -Exercise 8.9: Write a version of du that computes and periodically displays separate totals for each of the root directories. +练习8.9: 编写一个du工具,分别计算每一个root目录各自的总大小,并每隔一段时间将它们显示出来。