diff --git a/go.mod b/go.mod index ac6aa2a..eb351cb 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( github.com/gofiber/websocket/v2 v2.2.1 github.com/google/uuid v1.6.0 github.com/google/wire v0.7.0 + github.com/lukasjarosch/go-docx v0.5.0 github.com/ollama/ollama v0.12.7 github.com/redis/go-redis/v9 v9.16.0 github.com/robfig/cron/v3 v3.0.1 diff --git a/go.sum b/go.sum index 1c08769..b069ba2 100644 --- a/go.sum +++ b/go.sum @@ -327,6 +327,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/lukasjarosch/go-docx v0.5.0 h1:4vU+gJ4WMdqwRvRVFF+XMw3rPfUGSXlToPJIX3mHQsQ= +github.com/lukasjarosch/go-docx v0.5.0/go.mod h1:ka/NZgDIJId48vMvcfWfduVTY7uV0/f8EgsmCjuS9X0= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= @@ -582,6 +584,7 @@ golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/ golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200925080053-05aa5d4ee321/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net 
v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= diff --git a/internal/biz/advice.go b/internal/biz/advice.go index 5c2a93a..94ccd71 100644 --- a/internal/biz/advice.go +++ b/internal/biz/advice.go @@ -64,7 +64,7 @@ func (a *AdviceBiz) WordAna(ctx context.Context, wordContent string) error { func (a *AdviceBiz) callLlm(ctx context.Context, prompt string) (string, error) { var message = make([]*model.ChatCompletionMessage, 1) message[0] = &model.ChatCompletionMessage{ - Role: model.ChatMessageRoleSystem, + Role: model.ChatMessageRoleUser, Content: &model.ChatCompletionMessageContent{ StringValue: volcengine.String(prompt), }, diff --git a/internal/biz/llm_service/third_party/hsyq.go b/internal/biz/llm_service/third_party/hsyq.go index 763db1d..5257774 100644 --- a/internal/biz/llm_service/third_party/hsyq.go +++ b/internal/biz/llm_service/third_party/hsyq.go @@ -20,7 +20,7 @@ func NewHsyq() *Hsyq { func (h *Hsyq) getClient(key string) *arkruntime.Client { var client *arkruntime.Client - if _, ok := h.mapClient[key]; !ok { + if _, ok := h.mapClient[key]; ok { client = h.mapClient[key] } else { client = arkruntime.NewClientWithApiKey( @@ -39,6 +39,8 @@ func (h *Hsyq) RequestHsyq(ctx context.Context, key string, modelName string, pr req := model.CreateChatCompletionRequest{ Model: modelName, Messages: prompt, + Stream: new(bool), + Thinking: &model.Thinking{Type: model.ThinkingTypeDisabled}, } resp, err := h.getClient(key).CreateChatCompletion(ctx, req) diff --git a/internal/entitys/advicer.go b/internal/entitys/advicer.go index aaa92f1..46dd76e 100644 --- a/internal/entitys/advicer.go +++ b/internal/entitys/advicer.go @@ -14,7 +14,7 @@ type DialectFeatures struct { } func (e *DialectFeatures) Example() string { - return `{"region":"四川成都话","intensity":0.4,"key_words":["噻","要得","没得","不晓得","是不是","对的嘛","好嘛","晓得嘛","真的","正儿八经","说实话"]}` + return `{"region":"四川成都话","intensity":0.4,"key_words":["噻","要得","没得","不晓得","是不是"]}` } // 
SentencePatterns 句子模式 @@ -27,14 +27,14 @@ type SentencePatterns struct { } func (e *SentencePatterns) Example() string { - return `{"openingMode":["我是你们的职业顾问","我给你介绍一下","我们先来看一下"],"explanationMode":["是这样的","我跟你讲","因为...所以...","你发现没得"],"confirmationMode":["对吧?","是不是嘛?","你晓得不?","明白了噻?"],"summaryMode":["所以说","总结一下哈","简单说就是"],"transitionMode":["然后的话","再其次","除此之外","还有一点"]}` + return `{"openingMode":["我给你介绍一下","我们先来看一下"],"explanationMode":["是这样的","我跟你讲","你发现没得"],"confirmationMode":["对吧?","是不是嘛?","你晓得不?","明白了噻?"],"summaryMode":["所以说","简单说就是"],"transitionMode":["然后的话","再其次","还有一点"]}` } // PersonalityTags 个性标签 type PersonalityTags []string func (e *PersonalityTags) Example() string { - return `["耐心细致","本地通","数据控","关系型","细节控","乐观积极","诚实可信"]` + return `["耐心细致","细节控"]` } // ToneTags 语气标签 @@ -57,7 +57,7 @@ type SignatureDialogues struct { } func (e *SignatureDialogues) Example() string { - return `[{"context":"客户质疑地块大小","dialogue":"哥,14亩确实不大,但你要在成都是2.5环内城买房,这种是个普遍存在的一个现象。你看万景和绿城都是13亩,中铁建只有8.8亩,339那个帮泰只有11亩。我们虽然地小,但楼间距开阔啊,看过去都是200多米!"},{"context":"客户担心物业费高","dialogue":"姐,我懂你意思,我们也觉得物业费是有点贵。但招商物业是铂金服务,有管家送外卖、免费宠物喂养这些增值服务。你算一下,就算贵一块钱,十年也就多14000,但好物业让房子增值不止这点!"},{"context":"客户犹豫价格","dialogue":"说实话,这个地段的地价都比28板块贵5000多,但我们单价只贵3000。你看龙湖滨江云河颂套内单价都36000了,我们才33000,真的性价比高!现在不买,以后这个板块可能就买不起了。"},{"context":"介绍项目优势","dialogue":"我跟你讲,我们项目就三个核心:地段在2.5环内槐树店板块,产品是全玻璃幕墙+三层中空玻璃,价格是板块最低门槛。花400多万买这里,真的是可上可下!"},{"context":"处理客户异议","dialogue":"我懂你的担心,很多客户刚开始也这样想。但你看嘛,我们旁边那块48亩地还没拍,以后肯定也是大品牌开发商来做豪宅。到时候我们这边全是高端盘,价格只会往上走!"}]` + return `[{"context":"客户质疑地块大小","dialogue":"哥,14亩确实不大,但你要在成都是2.5环内城买房,这种是个普遍存在的一个现象。你看万景和绿城都是13亩,中铁建只有8.8亩,339那个帮泰只有11亩。我们虽然地小,但楼间距开阔啊,看过去都是200多米!"},{"context":"客户担心物业费高","dialogue":"姐,我懂你意思,我们也觉得物业费是有点贵。但招商物业是铂金服务,有管家送外卖、免费宠物喂养这些增值服务。你算一下,就算贵一块钱,十年也就多14000,但好物业让房子增值不止这点!"},{"context":"客户犹豫价格","dialogue":"说实话,这个地段的地价都比28板块贵5000多,但我们单价只贵3000。你看龙湖滨江云河颂套内单价都36000了,我们才33000,真的性价比高!现在不买,以后这个板块可能就买不起了。"}]` } // -------项目 @@ -73,21 +73,21 @@ func (e *RegionValue) Example() string { type 
// CoreSellingPoints holds the project's core selling points, keyed by
// category name.
//
// NOTE(review): the example below nests an object under every category while
// the type is map[string]string — presumably the example is only used as
// prompt text and never unmarshalled into this type; confirm against callers.
type CoreSellingPoints map[string]string

// Example returns a sample JSON document illustrating the expected shape of
// the core-selling-points data.
//
// The sample must be well-formed JSON and must describe selling points, not
// competitor comparisons: it lists two categories (planning/design and
// building quality), each with its own talking points.
func (e *CoreSellingPoints) Example() string {
	return `{"规划设计":{"楼间距":"一号楼看公园280米,二号楼看邦泰190米,三号楼间距71米","布局":"L型布局,最大化利用公园景观","容积率":"2.0,在主城区非常低","绿化率":"35%,加公园绿化实际超过50%"},"建筑品质":{"外立面":"全玻璃幕墙+铝单板,浅金色铝板","玻璃":"三层中空氩气玻璃(非双层),成本高一倍","层高":"3.2米层高,豪宅标准(很多盘只有3.05米)","架空层":"6米挑高,全架空设计"}}`
}
`{"交通配套":{"地铁":"双店路站350米(7号线),槐树店站550米(4号线),未来12号线","道路":"中环路、成洛大道,到春熙路5个站","通达性":"到火车东站2个站,到华西30分钟内"},"商业配套":{"高端商圈":"万象城1.6公里,三板桥1.8公里","社区商业":"成华奥园广场、十里风荷底商","未来商业":"上东里商业(明年开业,有永辉超市)"},"生态配套":{"公园体系":"楼下40亩槐树店公园,400米沙河公园,700米多宝寺公园","绿道系统":"沿沙河跑到塔山公园20多公里","环境优势":"2.5环内唯一有300亩生态带的楼盘"},"教育配套":{"幼儿园":"楼下公立幼儿园(明年9月招生)","小学":"城市附小锦汇东城(成华区生源最好的学校)","生源优势":"周边新盘都是300万+,生源纯粹"},"医疗配套":{"三甲医院":"市六医院、市二医院3公里内","顶尖医疗":"华西医院锦江院区30分钟车程","便利性":"到华西本部也是30分钟内"}}` + return `{"交通配套":{"地铁":"双店路站350米(7号线),槐树店站550米(4号线),未来12号线","道路":"中环路、成洛大道,到春熙路5个站","通达性":"到火车东站2个站,到华西30分钟内"},"教育配套":{"幼儿园":"楼下公立幼儿园","小学":"城市附小锦汇东城(成华区生源最好的学校)","生源优势":"周边新盘都是300万+,生源纯粹"},"医疗配套":{"三甲医院":"市六医院、市二医院3公里内","顶尖医疗":"华西医院锦江院区30分钟车程","便利性":"到华西本部也是30分钟内"}}` } // DeveloperBacking 开发商背书 @@ -103,33 +103,33 @@ func (e *DeveloperBacking) Example() string { type NeedsMining map[string]string func (e *NeedsMining) Example() string { - return `{"预算需求":["你们总价想控制在多少以内?","是考虑按揭还是一次性?","月供能接受多少范围?"],"居住需求":["几个人住?有老人小孩吗?","主要是自住还是考虑投资?","现在住哪里?想改善哪些方面?"],"通勤需求":["在哪个位置上班?","主要开车还是坐地铁?","对地铁距离有要求吗?"],"偏好需求":["更看重地段还是产品本身?","喜欢安静的还是热闹的?","对楼层、朝向有偏好吗?"],"时间需求":["打算什么时候入住?","现在看到什么阶段了?","决策需要跟家人商量吗?"]}` + return `{"预算需求":["你们总价想控制在多少以内?","是考虑按揭还是一次性?","月供能接受多少范围?"],"居住需求":["几个人住?有老人小孩吗?","主要是自住还是考虑投资?","现在住哪里?想改善哪些方面?"],"通勤需求":["在哪个位置上班?","主要开车还是坐地铁?","对地铁距离有要求吗?"]}` } // PainPointResponse 痛点应对策略 type PainPointResponse map[string]string func (e *PainPointResponse) Example() string { - return 
`{"地块太小":{"承认事实":"14亩确实不大","普遍现象":"2.5环内都是小地块,万景13亩,中铁建8.8亩","转化优势":"但人少安静,楼间距反而更开阔","对比竞品":"339的邦泰才11亩,人家上千万豪宅"},"物业费高":{"理解感受":"我懂你,我们也觉得有点贵","价值分析":"但6块里3块是增值服务(保洁、送外卖、宠物喂养)","价格补贴":"前三年补贴到5块,跟其他盘差不多","保值论证":"好物业让房子增值,紫东星座就是例子"},"开发商不知名":{"坦诚相告":"没听说过很正常,我们是成都首个项目","实力展示":"但中信主业是矿产,房地产只占20%,资金安全","案例对比":"邦泰、奥兰刚开始也没人知道,现在都认可了","品质承诺":"首个项目更要做好口碑,不计成本打造"},"周边有社区用地":{"明确规划":"那边是社区服务中心,最多三层楼","距离保证":"离我们有100米,不影响采光","未来价值":"社区配套齐全,生活更方便","对比安慰":"总比修高楼挡光好嘛"},"价格偏高":{"地段价值":"地段值这个价,2.5环内没得选","产品价值":"全玻璃幕墙、3.2米层高,成本就高","比较价值":"比龙湖、邦泰单价都低","门槛价值":"板块最低总价,以后更买不起"}}` + return `{"地块太小":{"承认事实":"14亩确实不大","普遍现象":"2.5环内都是小地块,万景13亩,中铁建8.8亩","转化优势":"但人少安静,楼间距反而更开阔","对比竞品":"339的邦泰才11亩,人家上千万豪宅"},"物业费高":{"理解感受":"我懂你,我们也觉得有点贵","价值分析":"但6块里3块是增值服务(保洁、送外卖)","价格补贴":"前三年补贴到5块,跟其他盘差不多"}}` } // ValueBuilding 价值塑造技巧 type ValueBuilding map[string]string func (e *ValueBuilding) Example() string { - return `{"地段价值塑造":["买房最重要的是地段、地段、还是地段","核心地段的核心资产才保值增值","2.5环内的地卖一块少一块,不可再生"],"产品价值塑造":["我们是用改善的价格,买豪宅的标准","很多细节都是3000万豪宅才有的配置","外立面成本比竞品高30%,但单价差不多"],"稀缺性塑造":["整个槐树店板块,我们是唯一有400万以下产品的","118四房三卫,全成都找不出第二个","200多米楼间距,主城区绝版"],"未来价值塑造":["现在垫垫脚买了,未来换房可上可下","旁边48亩地以后拍出来,肯定刷新地价","板块全部交付后,城市界面会完全不一样"]}` + return `{"地段价值塑造":["买房最重要的是地段、地段、还是地段","核心地段的核心资产才保值增值","2.5环内的地卖一块少一块,不可再生"],"产品价值塑造":["我们是用改善的价格,买豪宅的标准","很多细节都是3000万豪宅才有的配置","外立面成本比竞品高30%,但单价差不多"]}` } // ClosingTechniques 促单话术 type ClosingTechniques map[string]string func (e *ClosingTechniques) Example() string { - return `{"紧迫感营造":{"时间紧迫":["今天是月底最后一天,领导有压力价格可谈","我们刚刚开盘,还有额外优惠","月底冲业绩,价格最有弹性"],"房源稀缺":["118只剩20多套了,好楼层不多","这栋楼就60户,卖一套少一套","特价房只有这几套,今天不定可能就没了"]},"优惠策略":{"价格优惠":["今天定的话,我可以跟领导申请额外折扣","买车位的话,总价多给两个点优惠","一次性付款再优惠一个点"],"附加价值":["送一年物业费","送品牌家电礼包","优先选车位"]},"决策推动":{"小步推进":["要不先交个小定保留房源?","可以先排个号,有优惠优先通知你","今天不定的话,我帮你留意好楼层"],"风险规避":["现在不定,下次来可能就不是这个价了","好楼层不等人,上次有客户犹豫一天就没了","月底优惠政策可能明天就没了"]},"成交确认":{"二选一":["您是选902还是1002?","倾向118还是140?","喜欢东边户还是西边户?"],"假设成交":["那我给您准备合同了","车位您选哪个位置?","按揭资料我带您去准备"]}}` + return 
// parseWordContent extracts the plain text of a .docx document held in memory.
//
// data must be the raw bytes of a .docx file, which is a ZIP package. Only the
// main document part, word/document.xml, is read. Leading and trailing
// whitespace is trimmed from the result, so a document without text yields an
// empty string.
//
// An error is returned when data is not a readable ZIP archive, when the
// archive has no word/document.xml entry, or when that entry cannot be read.
//
// NOTE(review): the entry is read fully into memory without a size cap; if
// data comes from an untrusted URL, consider bounding the read.
func parseWordContent(data []byte) (string, error) {
	zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", fmt.Errorf("解压docx失败: %w", err)
	}

	for _, file := range zipReader.File {
		// Only the main document part carries the body text.
		if file.Name != "word/document.xml" {
			continue
		}

		xmlData, err := readDocxEntry(file)
		if err != nil {
			return "", err
		}

		text, err := parseWordXML(xmlData)
		if err != nil {
			return "", fmt.Errorf("解析XML失败: %w", err)
		}
		return strings.TrimSpace(text), nil
	}

	// A ZIP archive without the main part is not a usable Word document;
	// report it instead of silently returning empty text.
	return "", errors.New("docx中未找到word/document.xml")
}

// readDocxEntry returns the full decompressed content of one archive entry.
// The entry is closed before the function returns.
func readDocxEntry(file *zip.File) ([]byte, error) {
	rc, err := file.Open()
	if err != nil {
		return nil, fmt.Errorf("打开文档文件失败: %w", err)
	}
	defer rc.Close()

	xmlData, err := io.ReadAll(rc)
	if err != nil {
		return nil, fmt.Errorf("读取XML失败: %w", err)
	}
	return xmlData, nil
}

// parseWordXML converts the XML of a Word main document part into text.
//
// Each paragraph's runs are concatenated and every paragraph is followed by a
// newline. When the XML cannot be unmarshalled, the function falls back to
// extractTextSimple and returns whatever text that recovers. The returned
// error is therefore always nil; it is kept for the existing signature.
//
// NOTE(review): the struct mapping follows only body > p > r > t, so text in
// tables or hyperlinks is presumably skipped — confirm this is acceptable.
func parseWordXML(xmlData []byte) (string, error) {
	type WordDocument struct {
		XMLName xml.Name `xml:"document"`
		Body    struct {
			Paragraphs []struct {
				Runs []struct {
					Text string `xml:"t"`
				} `xml:"r"`
			} `xml:"p"`
		} `xml:"body"`
	}

	var doc WordDocument
	if err := xml.Unmarshal(xmlData, &doc); err != nil {
		// Best effort: salvage the text that precedes the malformed part.
		return extractTextSimple(xmlData), nil
	}

	var textBuilder strings.Builder
	for _, para := range doc.Body.Paragraphs {
		for _, run := range para.Runs {
			textBuilder.WriteString(run.Text)
		}
		textBuilder.WriteString("\n")
	}

	return textBuilder.String(), nil
}

// extractTextSimple is a tolerant fallback that concatenates the character
// data of every <t> element (matched by local name, any namespace) found
// before the first decoding error.
//
// It always terminates: decoding stops at end of input or at the first error.
// The XML decoder does not recover after a syntax error, and truncated input
// is reported as a syntax error rather than io.EOF, so retrying after an
// error would never make progress.
func extractTextSimple(xmlData []byte) string {
	var textBuilder strings.Builder

	decoder := xml.NewDecoder(bytes.NewReader(xmlData))
	inText := false // true while positioned inside a <t> element
	for {
		token, err := decoder.Token()
		if err != nil {
			// io.EOF and syntax errors both end the scan; keep what was read.
			break
		}

		switch tok := token.(type) {
		case xml.StartElement:
			if tok.Name.Local == "t" {
				inText = true
			}
		case xml.EndElement:
			if tok.Name.Local == "t" {
				inText = false
			}
		case xml.CharData:
			if inText {
				textBuilder.Write(tok)
			}
		}
	}

	return textBuilder.String()
}