Gibberish when writing news to html
Hello,
I tried to download daily Chinese oil news into an HTML file. The code is as follows:
# Download OIL news headlines.
def download_headlines(query, S_date, E_date):
    """Page backwards through Eikon news headlines from E_date to S_date.

    query  : Eikon news query string.
    S_date : ISO-format start timestamp (string); paging stops here.
    E_date : ISO-format end timestamp (string); paging begins here.

    Returns a de-duplicated DataFrame of headlines whose
    ``versionCreated`` is on or after S_date.
    """
    pages = []
    try:
        while E_date > S_date:
            headline = ek.get_news_headlines(query, date_to=E_date, count=100)
            if headline.empty:
                break  # nothing left to fetch -> avoid an endless loop
            pages.append(headline)
            # Continue paging from the timestamp of the oldest headline fetched.
            t = headline.iat[len(headline) - 1, 0]
            E_date = t.strftime("%Y-%m-%dT%H:%M:%S.%f")
    except Exception as exc:  # the original bare `except:` hid every error
        print('出错了...', exc)
    # DataFrame.append() was removed in pandas 2.0; concatenate once instead
    # of appending inside the loop (which was also quadratic).
    headlines = pd.concat(pages) if pages else pd.DataFrame()
    if not headlines.empty:
        headlines = headlines.drop_duplicates()
        headlines = headlines.drop(headlines[headlines.versionCreated < S_date].index)
    print('下载新闻标题总数:%d' % len(headlines))
    return headlines
# Reuters crude/product-topic news on the Chinese-language service (LZS),
# excluding the TV products.
query_oil = '( Topic:CRU OR Topic:PROD ) AND Source:RTRS NOT ( Product:RITV OR Product:RCNBC OR Product:LCNBC ) in LZS'

End_date = datetime.datetime.now()
Start_date = End_date - timedelta(days=1)

headlines_oil = download_headlines(query_oil, str(Start_date), str(End_date))
headlines = headlines_oil

# Fetch the full story body for every headline.
headlines['story'] = [ek.get_news_story(sid) for sid in headlines['storyId']]
# Normalise the downloaded story HTML with regular expressions.
import re

def _clean_story(text):
    """Strip markup noise and trailing boilerplate from one story's HTML."""
    text = re.sub(r"</p>|<p>|</div>", '', text)    # drop paragraph tags
    text = re.sub(r"<br/><br/>", '<br/>', text)    # collapse doubled breaks
    text = re.sub(r"<br/>", '<br/><br/>', text)    # force one blank line per break
    # "(完" or "(完" -> 【完】.  The original class [(|(] also matched a
    # literal '|'; only the two parenthesis characters are intended.
    text = re.sub(r"[((]完(.*?)$", '【完】', text)
    # Delete everything from "(c ... Copyright" onwards.
    text = re.sub(r"[((]c(.*?)Copyright(.*?)$", '', text)
    # Delete everything from "(编译/发稿/整理/审校/Reporting/Edit(t)ing".
    # The original wrote [编译|发稿|...], which is a character class matching
    # ANY single one of those characters; (?:...) is the intended alternation.
    text = re.sub(r"[((](?:编译|发稿|整理|审校|Reporting|Editt?ing)(.*?)$", '', text)
    return text

headlines['story'] = headlines['story'].apply(_clean_story)
# Write the cleaned stories out as an HTML file.
os.chdir(r'D:/TASK/Daily')
# The original opened the file twice ('w' then 'a'), leaking the first
# handle; a single truncating open inside a context manager is equivalent
# and guarantees the file is closed even if a write fails.
with open('daily.html', 'w', encoding='utf-8') as f:
    # Table of contents: one headline per line.
    f.write('目录<hr>')
    for title in headlines.text:
        f.write(title + '<br/>')
    f.write('<hr><br/>')
    # Full stories: headline in red, blank line, body, then a rule.
    for i in range(len(headlines)):
        f.write('<font color="#FF0000">' + headlines.iat[i, 1] + '</font><br/><br/>')
        f.write(headlines.iat[i, 4] + '<hr><br/>')
However, although I passed encoding='utf-8' when writing the HTML file, the output is still garbled. Can you help me?
Thank you
Best Answer
-
Hi @YanHan
I just tested this code and asked my colleague to confirm that the results are the same (no gibberish in the news stories from the API).
News from Eikon "News Monitor" app:
News from API:
# Fetch a single story and render it inline in Jupyter to verify that the
# text returned by the API is not garbled.
news_id = 'urn:newsml:reuters.com:20210303:nL3S2L10QI:3'
df = ek.get_news_story(news_id)
from IPython.core.display import HTML
# The stray "0" that followed this call was a forum vote-count artifact
# and made the line a syntax error.
HTML(df)
Answers
-
I tested your code in my Jupyter notebook running on Windows 10 but did not see the garbled words shown in your screenshot. You can find the sample output in the attached daily.zip file.
Below is the code, which I have modified slightly.
# Download OIL news headlines.
import pandas as pd

def download_headlines(query, S_date, E_date):
    """Page backwards through Eikon news headlines from E_date to S_date.

    query  : Eikon news query string.
    S_date : ISO-format start timestamp (string); paging stops here.
    E_date : ISO-format end timestamp (string); paging begins here.

    Returns a de-duplicated DataFrame of headlines whose
    ``versionCreated`` is on or after S_date.
    """
    pages = []
    try:
        while E_date > S_date:
            headline = ek.get_news_headlines(query, date_to=E_date, count=100)
            if headline.empty:
                break  # nothing left to fetch -> avoid an endless loop
            pages.append(headline)
            # Continue paging from the timestamp of the oldest headline fetched.
            t = headline.iat[len(headline) - 1, 0]
            E_date = t.strftime("%Y-%m-%dT%H:%M:%S.%f")
    except Exception as exc:  # the original bare `except:` hid every error
        print('出错了...', exc)
    # DataFrame.append() was removed in pandas 2.0; concatenate once instead
    # of appending inside the loop (which was also quadratic).
    headlines = pd.concat(pages) if pages else pd.DataFrame()
    if not headlines.empty:
        headlines = headlines.drop_duplicates()
        headlines = headlines.drop(headlines[headlines.versionCreated < S_date].index)
    print('下载新闻标题总数:%d' % len(headlines))
    return headlines
import datetime

# Reuters crude/product-topic news on the Chinese-language service (LZS),
# excluding the TV products.
query_oil = '( Topic:CRU OR Topic:PROD ) AND Source:RTRS NOT ( Product:RITV OR Product:RCNBC OR Product:LCNBC ) in LZS'

End_date = datetime.datetime.now()
Start_date = End_date - datetime.timedelta(days=1)

headlines_oil = download_headlines(query_oil, str(Start_date), str(End_date))
headlines = headlines_oil

# Fetch the full story body for every headline.
headlines['story'] = [ek.get_news_story(sid) for sid in headlines['storyId']]
# Normalise the downloaded story HTML with regular expressions.
import re

def _clean_story(text):
    """Strip markup noise and trailing boilerplate from one story's HTML."""
    text = re.sub(r"</p>|<p>|</div>", '', text)    # drop paragraph tags
    text = re.sub(r"<br/><br/>", '<br/>', text)    # collapse doubled breaks
    text = re.sub(r"<br/>", '<br/><br/>', text)    # force one blank line per break
    # "(完" or "(完" -> 【完】.  The original class [(|(] also matched a
    # literal '|'; only the two parenthesis characters are intended.
    text = re.sub(r"[((]完(.*?)$", '【完】', text)
    # Delete everything from "(c ... Copyright" onwards.
    text = re.sub(r"[((]c(.*?)Copyright(.*?)$", '', text)
    # Delete everything from "(编译/发稿/整理/审校/Reporting/Edit(t)ing".
    # The original wrote [编译|发稿|...], which is a character class matching
    # ANY single one of those characters; (?:...) is the intended alternation.
    text = re.sub(r"[((](?:编译|发稿|整理|审校|Reporting|Editt?ing)(.*?)$", '', text)
    return text

headlines['story'] = headlines['story'].apply(_clean_story)
# Write the result out as an HTML file.
import os

os.chdir(r'c:\\tmp')
# Append mode ('a+') keeps earlier runs' output, so the file grows each
# run; switch to 'w' to start fresh.  A context manager guarantees the
# handle is closed even if a write fails.  (A stray "1 -" forum artifact
# after the original f.close() has been removed.)
with open('daily.html', 'a+', encoding='utf-8') as f:
    # Table of contents: one headline per line.
    f.write('目录<hr>')
    for title in headlines.text:
        f.write(title + '<br/>')
    f.write('<hr><br/>')
    # Full stories: headline in red, blank line, body, then a rule.
    for i in range(len(headlines)):
        f.write('<font color="#FF0000">' + headlines.iat[i, 1] + '</font><br/><br/>')
        f.write(headlines.iat[i, 4] + '<hr><br/>')
Dissertation, like essay, is a demanding assignment. However, students frequently fail to write in a grammatically proper manner. I found a solution and went to https://www.bestessay.com/ , where I acquire my papers and always get them on time and in good shape, regardless of the topic's intricacy or my specifications. I encourage students to give it a shot!
0
Categories
- All Categories
- 6 AHS
- 37 Alpha
- 161 App Studio
- 4 Block Chain
- 4 Bot Platform
- 16 Connected Risk APIs
- 47 Data Fusion
- 30 Data Model Discovery
- 608 Datastream
- 1.3K DSS
- 577 Eikon COM
- 4.9K Eikon Data APIs
- 7 Electronic Trading
- Generic FIX
- 7 Local Bank Node API
- Trading API
- 2.7K Elektron
- 1.3K EMA
- 236 ETA
- 519 WebSocket API
- 33 FX Venues
- 10 FX Market Data
- 1 FX Post Trade
- 1 FX Trading - Matching
- 12 FX Trading – RFQ Maker
- 5 Intelligent Tagging
- 2 Legal One
- 20 Messenger Bot
- 2 Messenger Side by Side
- 9 ONESOURCE
- 7 Indirect Tax
- 59 Open Calais
- 264 Open PermID
- 39 Entity Search
- 2 Org ID
- PAM
- PAM - Logging
- 8.4K Private Comments
- 6 Product Insight
- Project Tracking
- ProView
- ProView Internal
- 20 RDMS
- 1.4K Refinitiv Data Platform
- 367 Refinitiv Data Platform Libraries
- 3 Refinitiv Due Diligence
- LSEG Due Diligence Portal API
- 3 Refinitiv Due Dilligence Centre
- Rose's Space
- 1.1K Screening
- 18 Qual-ID API
- 13 Screening Deployed
- 23 Screening Online
- 10 World-Check Customer Risk Screener
- 990 World-Check One
- 44 World-Check One Zero Footprint
- 45 Side by Side Integration API
- Test Space
- 3 Thomson One Smart
- 1.2K TR Internal
- Global Hackathon 2015
- 2 Specialists Who Code
- 10 TR Knowledge Graph
- 150 Transactions
- 142 REDI API
- 1.7K TREP APIs
- 4 CAT
- 21 DACS Station
- 117 Open DACS
- 1.1K RFA
- 103 UPA
- 172 TREP Infrastructure
- 224 TRKD
- 886 TRTH
- 5 Velocity Analytics
- 5 Wealth Management Web Services
- 59 Workspace SDK
- 9 Element Framework
- 5 Grid
- 13 World-Check Data File
- Yield Book Analytics
- 46 中文论坛