ai_scheduler/internal/pkg/file_download/file_download.go

237 lines
5.3 KiB
Go

package file_download
import (
	"archive/zip"
	"bytes"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"
	"time"
)
// DownloadFile fetches url with an HTTP GET and returns the response body
// together with a file name derived from the response.
//
// validFunc, when non-nil, is called with the response after the status check
// and before the body is read; a non-nil error from it aborts the download and
// is returned unchanged. Any status other than 200 OK is reported as an error.
//
// The whole body is buffered in memory; no size limit is applied.
func DownloadFile(url string, validFunc func(resp *http.Response) error) ([]byte, string, error) {
	// The client timeout covers the whole exchange, including reading the
	// body, so a stalled server cannot block the caller indefinitely.
	client := &http.Client{Timeout: 30 * time.Second}

	resp, err := client.Get(url)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// resp.Status already starts with the numeric code (e.g.
		// "404 Not Found"), so the code is not printed a second time.
		return nil, "", fmt.Errorf("HTTP %s", resp.Status)
	}

	if validFunc != nil {
		if err := validFunc(resp); err != nil {
			return nil, "", err
		}
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", fmt.Errorf("reading response body: %w", err)
	}

	return data, getFilenameFromURL(url, resp), nil
}
// getFilenameFromURL picks a file name for a downloaded resource. It tries, in
// order:
//
//  1. the filename parameter of the Content-Disposition response header,
//  2. the last segment of the URL path,
//  3. a generated default of the form "word_<unix seconds>.docx".
//
// Names obtained from sources 1 and 2 are passed through sanitizeFilename.
func getFilenameFromURL(urlStr string, resp *http.Response) string {
	// 1. Content-Disposition header.
	if header := resp.Header.Get("Content-Disposition"); header != "" {
		if name := filenameFromContentDisposition(header); name != "" {
			return sanitizeFilename(name)
		}
	}

	// 2. Last segment of the URL path. filepath.Base returns "." for an empty
	// path and "/" for a path made only of separators; neither is a name.
	if parsed, err := url.Parse(urlStr); err == nil {
		if name := filepath.Base(parsed.Path); name != "." && name != "/" {
			return sanitizeFilename(name)
		}
	}

	// 3. Generated default.
	return fmt.Sprintf("word_%d.docx", time.Now().Unix())
}

// filenameFromContentDisposition extracts the filename parameter from a
// Content-Disposition header value, returning "" when none can be found.
func filenameFromContentDisposition(header string) string {
	// mime.ParseMediaType understands quoting, additional parameters after
	// the file name, and the RFC 2231 "filename*=" encoded form (which it
	// reports under the plain "filename" key).
	if _, params, err := mime.ParseMediaType(header); err == nil {
		return strings.TrimSpace(params["filename"])
	}

	// Lenient fallback for headers that violate the grammar, such as an
	// unquoted name containing spaces: take what follows "filename=" up to
	// the next ';' and strip surrounding quotes and spaces.
	const key = "filename="
	i := strings.Index(header, key)
	if i < 0 {
		return ""
	}
	value := header[i+len(key):]
	if j := strings.IndexByte(value, ';'); j >= 0 {
		value = value[:j]
	}
	return strings.Trim(value, `"' `)
}
// sanitizeFilename replaces every character from the set / \ : * ? " < > |
// with an underscore and appends ".docx" when the result contains no dot.
func sanitizeFilename(filename string) string {
	cleaned := strings.NewReplacer(
		"/", "_",
		`\`, "_",
		":", "_",
		"*", "_",
		"?", "_",
		`"`, "_",
		"<", "_",
		">", "_",
		"|", "_",
	).Replace(filename)

	// A name that already contains a dot is taken to have an extension.
	if strings.IndexByte(cleaned, '.') >= 0 {
		return cleaned
	}
	return cleaned + ".docx"
}
// GetWordTextFromURL downloads the document at url via DownloadFile and runs
// the downloaded bytes through parseWordContent.
//
// It returns the extracted text and the file name reported by DownloadFile.
// validFunc is forwarded to DownloadFile as-is. A download or parse failure is
// wrapped, so the underlying error stays reachable through errors.Is/As.
func GetWordTextFromURL(url string, validFunc func(resp *http.Response) error) (string, string, error) {
	// Step 1: fetch the raw bytes.
	raw, name, downloadErr := DownloadFile(url, validFunc)
	if downloadErr != nil {
		return "", "", fmt.Errorf("下载失败: %w", downloadErr)
	}

	// Step 2: extract the text from the downloaded bytes.
	content, parseErr := parseWordContent(raw)
	if parseErr != nil {
		return "", "", fmt.Errorf("解析失败: %w", parseErr)
	}

	return content, name, nil
}
// 解析Word内容 - 简单版本,只提取文字
func parseWordContent(data []byte) (string, error) {
reader := bytes.NewReader(data)
zipReader, err := zip.NewReader(reader, int64(len(data)))
if err != nil {
return "", fmt.Errorf("解压docx失败: %v", err)
}
var textBuilder strings.Builder
// 遍历 ZIP 文件中的文件
for _, file := range zipReader.File {
// 只处理文档主体文件
if file.Name == "word/document.xml" {
rc, err := file.Open()
if err != nil {
return "", fmt.Errorf("打开文档文件失败: %v", err)
}
defer rc.Close()
// 读取 XML 内容
xmlData, err := io.ReadAll(rc)
if err != nil {
return "", fmt.Errorf("读取XML失败: %v", err)
}
// 提取文本
text, err := parseWordXML(xmlData)
if err != nil {
return "", fmt.Errorf("解析XML失败: %v", err)
}
textBuilder.WriteString(text)
break // 找到主文档后退出循环
}
}
return textBuilder.String(), nil
}
// parseWordXML extracts the text of a WordprocessingML main document part.
//
// The XML is walked token by token. Character data inside any element whose
// local name is "t" is collected, and a newline is written at the end of every
// element whose local name is "p". Matching is by local name only, at any
// nesting depth, so text inside tables, hyperlinks and other containers is
// included along with text in top-level paragraphs.
//
// Extraction is best-effort and the returned error is always nil:
//   - if the first element is not named "document", the input is handed to
//     extractTextSimple;
//   - if the decoder reports an error part-way through, the text collected up
//     to that point is returned.
func parseWordXML(xmlData []byte) (string, error) {
	decoder := xml.NewDecoder(bytes.NewReader(xmlData))

	var text strings.Builder
	rootSeen := false // whether the first start element has been inspected
	inText := false   // whether the decoder is currently inside a "t" element

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Decoder errors are not recoverable; keep what was gathered.
			return text.String(), nil
		}

		switch tok := token.(type) {
		case xml.StartElement:
			if !rootSeen {
				rootSeen = true
				if tok.Name.Local != "document" {
					return extractTextSimple(xmlData), nil
				}
			}
			if tok.Name.Local == "t" {
				inText = true
			}
		case xml.EndElement:
			switch tok.Name.Local {
			case "t":
				inText = false
			case "p":
				text.WriteByte('\n')
			}
		case xml.CharData:
			if inText {
				text.Write(tok)
			}
		}
	}

	return text.String(), nil
}
// extractTextSimple is a best-effort text extractor. It scans xmlData token by
// token and, for every start element whose local name is "t", appends the
// character data that immediately follows it. The pieces are concatenated
// without separators.
//
// Scanning stops at the end of input or at the first decoder error, and the
// text collected so far is returned.
func extractTextSimple(xmlData []byte) string {
	var textBuilder strings.Builder
	decoder := xml.NewDecoder(bytes.NewReader(xmlData))

	for {
		token, err := decoder.Token()
		if err != nil {
			// Covers io.EOF as well as syntax errors. A decoder error is
			// returned again by every later Token call, so retrying would
			// never make progress.
			break
		}

		start, ok := token.(xml.StartElement)
		if !ok || start.Name.Local != "t" {
			continue
		}

		// The text of a <t> element is the token right after its start tag.
		next, err := decoder.Token()
		if err != nil {
			break
		}
		if charData, ok := next.(xml.CharData); ok {
			textBuilder.Write(charData)
		}
	}

	return textBuilder.String()
}
// IsWordFile reports whether the response's Content-Type header names a Word
// document. It returns nil when the lower-cased header value contains one of
// the accepted media types and an error otherwise.
//
// It has the signature expected by the validFunc parameter of DownloadFile.
func IsWordFile(resp *http.Response) error {
	header := strings.ToLower(resp.Header.Get("Content-Type"))

	accepted := [...]string{
		"application/msword",
		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
		"application/vnd.ms-word",
		// Some servers serve downloads under the generic binary type.
		"application/octet-stream",
	}

	// Substring matching tolerates parameters such as "; charset=...".
	for i := range accepted {
		if strings.Contains(header, accepted[i]) {
			return nil
		}
	}
	return errors.New("错误的文件类型")
}