// Package file_download downloads files over HTTP and extracts the text of
// Word documents.
package file_download

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"strings"
	"time"

	"github.com/unidoc/unioffice/document"
)

// downloadTimeout is the http.Client timeout used by DownloadFile.
const downloadTimeout = 30 * time.Second

// illegalFilenameChars replaces every character that sanitizeFilename treats
// as illegal in a file name with an underscore, in a single pass.
var illegalFilenameChars = strings.NewReplacer(
	"/", "_",
	`\`, "_",
	":", "_",
	"*", "_",
	"?", "_",
	`"`, "_",
	"<", "_",
	">", "_",
	"|", "_",
)

// DownloadFile fetches url with an HTTP GET and returns the response body
// together with a file name derived from the response (see
// getFilenameFromURL).
//
// Any status other than 200 OK is reported as an error. If validFunc is not
// nil it is called with the response before the body is read, and a non-nil
// error from it aborts the download and is returned unchanged.
//
// NOTE: the whole body is read into memory without a size limit.
func DownloadFile(url string, validFunc func(resp *http.Response) error) ([]byte, string, error) {
	client := &http.Client{Timeout: downloadTimeout}

	resp, err := client.Get(url)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
	}

	// Let the caller reject the response (for example by Content-Type)
	// before any of the body is downloaded.
	if validFunc != nil {
		if err := validFunc(resp); err != nil {
			return nil, "", err
		}
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", err
	}

	return data, getFilenameFromURL(url, resp), nil
}

// getFilenameFromURL picks a file name for a downloaded response. It tries, in
// order:
//
//  1. the filename carried by the Content-Disposition header,
//  2. the last element of the URL path,
//  3. a generated default of the form "word_<unix seconds>.docx".
//
// Names taken from the header or the URL are passed through sanitizeFilename.
func getFilenameFromURL(urlStr string, resp *http.Response) string {
	// 1. Content-Disposition header.
	if name := filenameFromContentDisposition(resp.Header.Get("Content-Disposition")); name != "" {
		return sanitizeFilename(name)
	}

	// 2. Last element of the URL path. URL paths are always slash-separated,
	// so package path (not path/filepath) is the right tool here.
	if parsed, err := url.Parse(urlStr); err == nil && parsed.Path != "" {
		if name := path.Base(parsed.Path); name != "." && name != "/" {
			return sanitizeFilename(name)
		}
	}

	// 3. Generated default.
	return fmt.Sprintf("word_%d.docx", time.Now().Unix())
}

// filenameFromContentDisposition extracts the file name from a
// Content-Disposition header value. It returns "" when the header is empty or
// carries no file name.
func filenameFromContentDisposition(header string) string {
	if header == "" {
		return ""
	}

	// Proper parse: handles quoted values, additional parameters after the
	// file name, case-insensitive parameter names and the encoded
	// "filename*" form.
	if _, params, err := mime.ParseMediaType(header); err == nil {
		return params["filename"]
	}

	// Lenient fallback for headers that are not well-formed (for example an
	// unquoted name containing spaces): take the text after "filename=" up to
	// the next ';' and strip surrounding quotes and spaces.
	const key = "filename="
	idx := strings.Index(header, key)
	if idx < 0 {
		return ""
	}
	value := header[idx+len(key):]
	if semi := strings.IndexByte(value, ';'); semi >= 0 {
		value = value[:semi]
	}
	return strings.Trim(value, `"' `)
}

// sanitizeFilename makes filename safe to use as a single file name: the
// characters / \ : * ? " < > | are replaced with '_', the special names "."
// and ".." are neutralised, and ".docx" is appended when the name contains no
// dot at all.
func sanitizeFilename(filename string) string {
	filename = illegalFilenameChars.Replace(filename)

	// "." and ".." refer to directories rather than files; never hand them
	// back to a caller that may join the name onto a directory.
	if filename == "." || filename == ".." {
		filename = strings.Repeat("_", len(filename))
	}

	// Make sure the name has an extension.
	if !strings.Contains(filename, ".") {
		filename += ".docx"
	}
	return filename
}

// GetWordTextFromURL downloads the Word document at url and returns its plain
// text followed by the file name chosen for the download. validFunc is
// forwarded to DownloadFile.
func GetWordTextFromURL(url string, validFunc func(resp *http.Response) error) (string, string, error) {
	// 1. Download the file.
	data, fileName, err := DownloadFile(url, validFunc)
	if err != nil {
		return "", "", fmt.Errorf("下载失败: %w", err)
	}

	// 2. Parse the Word document.
	text, err := parseWordContent(data)
	if err != nil {
		return "", "", fmt.Errorf("解析失败: %w", err)
	}

	return text, fileName, nil
}

// parseWordContent reads data as a Word document and returns its text: the
// runs of each paragraph are concatenated, paragraphs are separated by a
// newline, and leading/trailing whitespace is trimmed from the result.
func parseWordContent(data []byte) (string, error) {
	doc, err := document.Read(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", err
	}
	defer doc.Close()

	var text strings.Builder
	for _, paragraph := range doc.Paragraphs() {
		for _, run := range paragraph.Runs() {
			text.WriteString(run.Text())
		}
		// One line per paragraph.
		text.WriteString("\n")
	}

	return strings.TrimSpace(text.String()), nil
}

// parseWordFile describes the Word file at filePath as a map.
//
// The map always contains "filepath" and "format" (the lower-cased file
// extension). For ".docx" files it also contains "paragraphs" (the non-blank
// paragraph texts), "tables" (one map per table with "rows" and "row_count"),
// "paragraph_count" and "table_count". For ".doc" files it contains
// "binary_size" and "note" only, because the legacy format is not parsed.
// "filesize" and "modified" are added when the file can be stat'ed.
func parseWordFile(filePath string) (map[string]interface{}, error) {
	ext := strings.ToLower(filepath.Ext(filePath))
	result := map[string]interface{}{
		"filepath": filePath,
		"format":   ext,
	}

	switch ext {
	case ".docx":
		doc, err := document.Open(filePath)
		if err != nil {
			return nil, err
		}
		defer doc.Close()

		// Paragraph texts, skipping paragraphs that are blank.
		var paragraphs []string
		for _, p := range doc.Paragraphs() {
			var text strings.Builder
			for _, run := range p.Runs() {
				text.WriteString(run.Text())
			}
			if s := text.String(); strings.TrimSpace(s) != "" {
				paragraphs = append(paragraphs, s)
			}
		}

		// Table contents: each row maps the cell index to the cell text.
		var tables []map[string]interface{}
		for _, table := range doc.Tables() {
			var rows []map[int]string
			for _, row := range table.Rows() {
				rowData := make(map[int]string)
				for cellIdx, cell := range row.Cells() {
					var cellText strings.Builder
					for _, p := range cell.Paragraphs() {
						for _, run := range p.Runs() {
							cellText.WriteString(run.Text())
						}
					}
					rowData[cellIdx] = cellText.String()
				}
				rows = append(rows, rowData)
			}
			tables = append(tables, map[string]interface{}{
				"rows":      rows,
				"row_count": len(rows),
			})
		}

		result["paragraphs"] = paragraphs
		result["tables"] = tables
		result["paragraph_count"] = len(paragraphs)
		result["table_count"] = len(tables)

	case ".doc":
		// Legacy .doc files need a dedicated parser; only the size of the
		// raw file is reported here.
		data, err := os.ReadFile(filePath)
		if err != nil {
			return nil, err
		}
		result["binary_size"] = len(data)
		result["note"] = ".doc 文件需要专门的解析库"
	}

	// File metadata is best-effort: a failing Stat simply leaves the two
	// keys out instead of failing the whole call.
	if info, err := os.Stat(filePath); err == nil {
		result["filesize"] = info.Size()
		result["modified"] = info.ModTime()
	}

	return result, nil
}

// IsWordFile reports, via a nil error, whether the Content-Type header of resp
// looks like a Word document. It has the signature expected for the validFunc
// argument of DownloadFile.
func IsWordFile(resp *http.Response) error {
	wordContentTypes := []string{
		"application/msword",
		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
		"application/vnd.ms-word",
		"application/octet-stream", // some servers return this generic type
	}

	// Substring match so that parameters such as "; charset=..." are ignored.
	contentType := strings.ToLower(resp.Header.Get("Content-Type"))
	for _, ct := range wordContentTypes {
		if strings.Contains(contentType, ct) {
			return nil
		}
	}

	return errors.New("错误的文件类型")
}