247 lines
5.6 KiB
Go
247 lines
5.6 KiB
Go
package file_download
|
|
|
|
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"strings"
	"time"

	"github.com/unidoc/unioffice/document"
)
|
|
|
|
// 下载文件
|
|
func DownloadFile(url string, validFunc func(resp *http.Response) error) ([]byte, string, error) {
|
|
// 设置超时
|
|
client := &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
}
|
|
|
|
// 发送请求
|
|
resp, err := client.Get(url)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
|
|
}
|
|
|
|
if validFunc != nil {
|
|
err = validFunc(resp)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
}
|
|
|
|
// 读取文件数据
|
|
data, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
|
|
// 获取文件名
|
|
filename := getFilenameFromURL(url, resp)
|
|
|
|
return data, filename, nil
|
|
}
|
|
|
|
// 从 URL 或响应头获取文件名
|
|
func getFilenameFromURL(urlStr string, resp *http.Response) string {
|
|
// 1. 尝试从 Content-Disposition 头获取
|
|
contentDisposition := resp.Header.Get("Content-Disposition")
|
|
if contentDisposition != "" {
|
|
if strings.Contains(contentDisposition, "filename=") {
|
|
parts := strings.Split(contentDisposition, "filename=")
|
|
if len(parts) > 1 {
|
|
filename := strings.Trim(parts[1], `"' `)
|
|
return sanitizeFilename(filename)
|
|
}
|
|
}
|
|
}
|
|
|
|
// 2. 从 URL 路径获取
|
|
parsedURL, err := url.Parse(urlStr)
|
|
if err == nil {
|
|
path := parsedURL.Path
|
|
if path != "" {
|
|
filename := filepath.Base(path)
|
|
if filename != "" && filename != "." && filename != "/" {
|
|
return sanitizeFilename(filename)
|
|
}
|
|
}
|
|
}
|
|
|
|
// 3. 生成默认文件名
|
|
return fmt.Sprintf("word_%d.docx", time.Now().Unix())
|
|
}
|
|
|
|
// sanitizeFilename turns filename into a value that is safe to use as a
// single path component.
//
// The characters / \ : * ? " < > | and ASCII control characters are each
// replaced with "_". A non-empty name made up solely of dots (such as "." or
// "..") refers to a directory rather than a file, so its dots are replaced
// with "_" as well. Finally, a name that contains no "." gets ".docx"
// appended.
func sanitizeFilename(filename string) string {
	const illegalChars = `/\:*?"<>|`

	// Every character being replaced is ASCII, so a byte-wise pass is safe
	// for UTF-8 input: bytes of multi-byte runes are all >= 0x80 and are
	// copied through untouched.
	var b strings.Builder
	b.Grow(len(filename) + len(".docx"))
	for i := 0; i < len(filename); i++ {
		c := filename[i]
		if c < 0x20 || c == 0x7f || strings.IndexByte(illegalChars, c) >= 0 {
			b.WriteByte('_')
			continue
		}
		b.WriteByte(c)
	}
	name := b.String()

	// "." and ".." would otherwise survive and resolve to a directory when
	// joined onto a destination path.
	if name != "" && strings.Trim(name, ".") == "" {
		name = strings.Repeat("_", len(name))
	}

	// Make sure the name carries an extension.
	if !strings.Contains(name, ".") {
		name += ".docx"
	}

	return name
}
|
|
|
|
// 从URL获取Word文件的纯文本内容
|
|
func GetWordTextFromURL(url string, validFunc func(resp *http.Response) error) (string, string, error) {
|
|
// 1. 下载文件
|
|
data, fileName, err := DownloadFile(url, validFunc)
|
|
if err != nil {
|
|
return "", "", fmt.Errorf("下载失败: %w", err)
|
|
}
|
|
|
|
// 2. 解析Word文件
|
|
text, err := parseWordContent(data)
|
|
if err != nil {
|
|
return "", "", fmt.Errorf("解析失败: %w", err)
|
|
}
|
|
|
|
return text, fileName, nil
|
|
}
|
|
|
|
// 解析Word内容 - 简单版本,只提取文字
|
|
func parseWordContent(data []byte) (string, error) {
|
|
// 创建reader
|
|
reader := bytes.NewReader(data)
|
|
|
|
// 打开Word文档
|
|
doc, err := document.Read(reader, int64(len(data)))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer doc.Close()
|
|
|
|
// 提取所有文字
|
|
var textBuilder strings.Builder
|
|
|
|
// 遍历所有段落
|
|
for _, paragraph := range doc.Paragraphs() {
|
|
// 遍历段落中的所有文本块
|
|
for _, run := range paragraph.Runs() {
|
|
textBuilder.WriteString(run.Text())
|
|
}
|
|
// 每个段落后面加换行
|
|
textBuilder.WriteString("\n")
|
|
}
|
|
|
|
// 返回清理过的文本
|
|
result := strings.TrimSpace(textBuilder.String())
|
|
return result, nil
|
|
}
|
|
|
|
// 解析 Word 文件内容
|
|
func parseWordFile(filePath string) (map[string]interface{}, error) {
|
|
ext := strings.ToLower(filepath.Ext(filePath))
|
|
|
|
result := map[string]interface{}{
|
|
"filepath": filePath,
|
|
"format": ext,
|
|
}
|
|
|
|
// 解析 .docx 文件
|
|
if ext == ".docx" {
|
|
doc, err := document.Open(filePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer doc.Close()
|
|
|
|
// 提取段落文本
|
|
var paragraphs []string
|
|
for _, p := range doc.Paragraphs() {
|
|
text := ""
|
|
for _, run := range p.Runs() {
|
|
text += run.Text()
|
|
}
|
|
if strings.TrimSpace(text) != "" {
|
|
paragraphs = append(paragraphs, text)
|
|
}
|
|
}
|
|
|
|
// 提取表格内容
|
|
var tables []map[string]interface{}
|
|
for _, table := range doc.Tables() {
|
|
tableData := make(map[string]interface{})
|
|
var rows []map[int]string
|
|
|
|
for _, row := range table.Rows() {
|
|
rowData := make(map[int]string)
|
|
for cellIdx, cell := range row.Cells() {
|
|
cellText := ""
|
|
for _, p := range cell.Paragraphs() {
|
|
for _, run := range p.Runs() {
|
|
cellText += run.Text()
|
|
}
|
|
}
|
|
rowData[cellIdx] = cellText
|
|
}
|
|
rows = append(rows, rowData)
|
|
}
|
|
|
|
tableData["rows"] = rows
|
|
tableData["row_count"] = len(rows)
|
|
tables = append(tables, tableData)
|
|
}
|
|
|
|
result["paragraphs"] = paragraphs
|
|
result["tables"] = tables
|
|
result["paragraph_count"] = len(paragraphs)
|
|
result["table_count"] = len(tables)
|
|
|
|
} else if ext == ".doc" {
|
|
// 对于 .doc 文件,可能需要其他库或转换
|
|
// 这里简单读取为二进制文件
|
|
data, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
result["binary_size"] = len(data)
|
|
result["note"] = ".doc 文件需要专门的解析库"
|
|
}
|
|
|
|
// 获取文件信息
|
|
fileInfo, _ := os.Stat(filePath)
|
|
if fileInfo != nil {
|
|
result["filesize"] = fileInfo.Size()
|
|
result["modified"] = fileInfo.ModTime()
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// IsWordFile checks whether the Content-Type of resp names a Word document.
//
// It returns nil when the media type starts with one of the accepted Word
// types, or with the generic "application/octet-stream" that some servers
// send for downloads. Otherwise, or when resp is nil, it returns an error.
// The comparison is case-insensitive and ignores parameters such as
// "; charset=utf-8".
func IsWordFile(resp *http.Response) error {
	errWrongType := errors.New("错误的文件类型")
	if resp == nil {
		return errWrongType
	}

	// Look only at the media type itself. Matching against the whole header
	// would let a Word type that merely appears inside a parameter pass.
	mediaType := resp.Header.Get("Content-Type")
	if end := strings.IndexByte(mediaType, ';'); end >= 0 {
		mediaType = mediaType[:end]
	}
	mediaType = strings.ToLower(strings.TrimSpace(mediaType))

	wordContentTypes := []string{
		"application/msword",
		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
		"application/vnd.ms-word",
		"application/octet-stream", // some servers return this for any download
	}

	// Prefix match so that subtypes extending an accepted type, such as
	// "application/vnd.ms-word.document.macroenabled.12", are still accepted.
	for _, ct := range wordContentTypes {
		if strings.HasPrefix(mediaType, ct) {
			return nil
		}
	}
	return errWrongType
}
|