ai_scheduler/internal/pkg/file_download/file_download.go

247 lines
5.6 KiB
Go

package file_download
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"os"
	"path"
	"path/filepath"
	"strings"
	"time"

	"github.com/unidoc/unioffice/document"
)
// DownloadFile issues an HTTP GET for url using a client with a 30-second
// timeout and returns the response body plus the file name chosen by
// getFilenameFromURL.
//
// Any status other than 200 OK is reported as an error of the form
// "HTTP <code>: <status>". When validFunc is non-nil it is called with the
// response before the body is read; a non-nil error from it aborts the
// download and is returned unchanged.
func DownloadFile(url string, validFunc func(resp *http.Response) error) ([]byte, string, error) {
	httpClient := http.Client{Timeout: 30 * time.Second}

	resp, err := httpClient.Get(url)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
	}

	// Give the caller a chance to reject the response (for example by
	// inspecting its headers) before any of the body is consumed.
	if validFunc != nil {
		if rejectErr := validFunc(resp); rejectErr != nil {
			return nil, "", rejectErr
		}
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", err
	}

	return body, getFilenameFromURL(url, resp), nil
}
// getFilenameFromURL picks a file name for a downloaded resource.
//
// Candidates are tried in this order:
//  1. the filename parameter of the Content-Disposition response header,
//  2. the last element of the path of urlStr,
//  3. a generated name of the form "word_<unix-seconds>.docx".
//
// Names taken from the first two sources are passed through
// sanitizeFilename before being returned. An empty filename parameter is
// treated as absent, so the URL is consulted instead.
func getFilenameFromURL(urlStr string, resp *http.Response) string {
	if header := resp.Header.Get("Content-Disposition"); header != "" {
		if name := contentDispositionFilename(header); name != "" {
			return sanitizeFilename(name)
		}
	}

	if parsed, err := url.Parse(urlStr); err == nil {
		// URL paths are always slash-separated, so package path is used
		// rather than the OS-dependent path/filepath. path.Base yields "."
		// for an empty path and "/" for a root-only path; neither is a name.
		if base := path.Base(parsed.Path); base != "." && base != "/" {
			return sanitizeFilename(base)
		}
	}

	return fmt.Sprintf("word_%d.docx", time.Now().Unix())
}

// contentDispositionFilename extracts the filename parameter from a
// Content-Disposition header value. It returns "" when no file name can be
// found.
func contentDispositionFilename(header string) string {
	// mime.ParseMediaType handles quoting, parameters that follow the file
	// name, case-insensitive parameter names and the RFC 2231 extended form
	// (filename*=UTF-8''...), which it decodes into the "filename" key.
	if _, params, err := mime.ParseMediaType(header); err == nil {
		return params["filename"]
	}

	// Lenient fallback for headers the strict parser rejects, such as an
	// unquoted name containing spaces or a header with no disposition type.
	const key = "filename="
	start := strings.Index(header, key)
	if start < 0 {
		return ""
	}
	value := header[start+len(key):]
	// Stop at the next parameter so it does not leak into the name.
	if end := strings.IndexByte(value, ';'); end >= 0 {
		value = value[:end]
	}
	return strings.Trim(value, `"' `)
}
// sanitizeFilename makes filename safe to use as a single path element.
//
// Each of the characters / \ : * ? " < > | is replaced with an underscore.
// If the result contains no '.', the extension ".docx" is appended.
func sanitizeFilename(filename string) string {
	const forbidden = `/\:*?"<>|`

	// Every forbidden character is a single ASCII byte, so a byte-wise pass
	// is sufficient and leaves multi-byte UTF-8 sequences untouched.
	var cleaned strings.Builder
	cleaned.Grow(len(filename) + len(".docx"))
	hasDot := false
	for i := 0; i < len(filename); i++ {
		ch := filename[i]
		switch {
		case strings.IndexByte(forbidden, ch) >= 0:
			ch = '_'
		case ch == '.':
			hasDot = true
		}
		cleaned.WriteByte(ch)
	}

	if !hasDot {
		cleaned.WriteString(".docx")
	}
	return cleaned.String()
}
// GetWordTextFromURL downloads the Word file at url and returns its plain
// text together with the file name reported by DownloadFile.
//
// validFunc is forwarded to DownloadFile unchanged and may be nil. A download
// failure is wrapped with the prefix "下载失败: " and a parse failure with
// "解析失败: "; in both cases the returned strings are empty.
func GetWordTextFromURL(url string, validFunc func(resp *http.Response) error) (string, string, error) {
	payload, name, err := DownloadFile(url, validFunc)
	if err != nil {
		return "", "", fmt.Errorf("下载失败: %w", err)
	}

	content, err := parseWordContent(payload)
	if err != nil {
		return "", "", fmt.Errorf("解析失败: %w", err)
	}

	return content, name, nil
}
// parseWordContent reads a Word document from data and returns its text.
//
// This is the simple variant: it only walks the paragraphs returned by
// doc.Paragraphs(), concatenates the text of each paragraph's runs, and
// terminates every paragraph with a newline. Leading and trailing whitespace
// is trimmed from the final result. An error from document.Read is returned
// as-is.
func parseWordContent(data []byte) (string, error) {
	doc, err := document.Read(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", err
	}
	defer doc.Close()

	var out strings.Builder
	for _, para := range doc.Paragraphs() {
		for _, piece := range para.Runs() {
			out.WriteString(piece.Text())
		}
		out.WriteByte('\n')
	}

	return strings.TrimSpace(out.String()), nil
}
// parseWordFile inspects the Word file at filePath and returns a summary map.
//
// The map always contains:
//   - "filepath": filePath as given
//   - "format": the lower-cased extension of filePath, including the dot
//
// For ".docx" files it additionally contains:
//   - "paragraphs" ([]string): text of each paragraph that is not blank
//   - "paragraph_count" (int): length of "paragraphs"
//   - "tables" ([]map[string]interface{}): one entry per table holding
//     "rows" ([]map[int]string, cell index to cell text) and "row_count"
//   - "table_count" (int): length of "tables"
//
// For ".doc" files it contains "binary_size" (the file length in bytes) and
// an explanatory "note"; the content is not parsed.
//
// When os.Stat succeeds, "filesize" and "modified" are added for any
// extension.
//
// An error is returned only when a .docx file cannot be opened or a .doc
// file cannot be read; in that case the map is nil.
func parseWordFile(filePath string) (map[string]interface{}, error) {
	ext := strings.ToLower(filepath.Ext(filePath))
	result := map[string]interface{}{
		"filepath": filePath,
		"format":   ext,
	}

	switch ext {
	case ".docx":
		doc, err := document.Open(filePath)
		if err != nil {
			return nil, err
		}
		defer doc.Close()

		// A single builder is reused for every paragraph and cell so run
		// text is not accumulated with repeated string concatenation.
		var buf strings.Builder

		// Paragraph text, skipping paragraphs that are only whitespace.
		var paragraphs []string
		for _, p := range doc.Paragraphs() {
			buf.Reset()
			for _, run := range p.Runs() {
				buf.WriteString(run.Text())
			}
			if text := buf.String(); strings.TrimSpace(text) != "" {
				paragraphs = append(paragraphs, text)
			}
		}

		// Table content: each row maps cell index to the cell's text, with
		// all paragraphs of a cell joined without a separator.
		var tables []map[string]interface{}
		for _, table := range doc.Tables() {
			var rows []map[int]string
			for _, row := range table.Rows() {
				rowData := make(map[int]string)
				for cellIdx, cell := range row.Cells() {
					buf.Reset()
					for _, p := range cell.Paragraphs() {
						for _, run := range p.Runs() {
							buf.WriteString(run.Text())
						}
					}
					rowData[cellIdx] = buf.String()
				}
				rows = append(rows, rowData)
			}
			tables = append(tables, map[string]interface{}{
				"rows":      rows,
				"row_count": len(rows),
			})
		}

		result["paragraphs"] = paragraphs
		result["tables"] = tables
		result["paragraph_count"] = len(paragraphs)
		result["table_count"] = len(tables)

	case ".doc":
		// Legacy .doc needs a dedicated parser or a conversion step; here
		// the file is only read as raw bytes to report its size.
		data, err := os.ReadFile(filePath)
		if err != nil {
			return nil, err
		}
		result["binary_size"] = len(data)
		result["note"] = ".doc 文件需要专门的解析库"
	}

	// File metadata is best-effort: if Stat fails the two keys are simply
	// left out rather than failing the whole call.
	if info, err := os.Stat(filePath); err == nil {
		result["filesize"] = info.Size()
		result["modified"] = info.ModTime()
	}

	return result, nil
}
// IsWordFile checks whether the response's Content-Type header indicates a
// Word document. Its signature matches the validFunc parameter of
// DownloadFile.
//
// The header value is lower-cased and accepted when it contains any of the
// known Word media types, or the generic "application/octet-stream". It
// returns nil on a match and an error otherwise.
func IsWordFile(resp *http.Response) error {
	header := strings.ToLower(resp.Header.Get("Content-Type"))

	for _, accepted := range [...]string{
		"application/msword",
		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
		"application/vnd.ms-word",
		"application/octet-stream", // some servers report this generic type
	} {
		if strings.Contains(header, accepted) {
			return nil
		}
	}

	return errors.New("错误的文件类型")
}