QQ Chat History Analysis (QQMsgAnalysis)

Export the chat history from the desktop QQ client as a txt file and save it as message.txt.
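The converter in the preprocessing section below assumes each exported message occupies three lines: a "date time sender" line, the message text, and a blank separator line. A hypothetical snippet of that layout (timestamps made up, names taken from the script):

2022-08-07 21:30:15 世界之巅
晚上好

2022-08-07 21:31:02 王婧怡(清辉)
[图片]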

Required packages: numpy, pandas, matplotlib, seaborn, jieba, wordcloud, tqdm, Pillow, paddlepaddle, paddlenlp
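If installing from scratch, a single pip command along these lines should cover them (this command is a convenience, not part of the original post):

pip install numpy pandas matplotlib seaborn jieba wordcloud tqdm pillow paddlepaddle paddlenlp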

Preparation

Imports

  • pandas: core data-frame handling
  • matplotlib & seaborn: main plotting libraries
  • jieba: Chinese word segmentation
  • wordcloud: word clouds
  • paddlenlp: sentiment analysis
import re
import time
import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
from PIL import Image
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.ticker as mticker
import matplotlib.transforms as mtransforms
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
from tqdm import tqdm
from paddlenlp import Taskflow
%matplotlib inline

Data Preprocessing

import csv
from datetime import datetime

def convert_messages(input_file, output_file):
    # CSV header
    header = ["LocalId", "TalkerId", "Type", "SubType", "IsSender", "CreateTime", "Status",
              "StrContent", "StrTime", "Remark", "NickName", "Sender"]

    # Open the output file
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        csv_writer = csv.writer(outfile)
        csv_writer.writerow(header)  # write the header row

        # Collect the names that appear
        names = set()

        # Read the txt file line by line and convert each record to a CSV row
        with open(input_file, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            for i in range(0, len(lines), 3):
                timestamp_and_name = lines[i].strip().split(' ', 2)
                date_time = timestamp_and_name[0] + ' ' + timestamp_and_name[1]  # date and time
                name = timestamp_and_name[2]  # name (or account)

                # Skip messages from anyone outside the two participants
                if name not in ["王婧怡(清辉)", "成王败寇。", "皇后", "王婧怡", "世界之巅", "2558749399"]:
                    continue

                message = lines[i + 1].strip()  # message content

                # Skip empty messages
                if not message:
                    continue

                # Set the Type field based on the message content
                if "[图片]" in message:
                    type_value = 3
                elif "[表情]" in message:
                    type_value = 47
                else:
                    type_value = 1  # default: text

                # Set the IsSender field based on the name
                if name in ["王婧怡(清辉)", "成王败寇。", "皇后", "王婧怡"]:
                    is_sender = 0
                elif name in ["世界之巅", "2558749399"]:
                    is_sender = 1

                # Record names we have not seen before
                if name not in names:
                    # print(name)
                    names.add(name)

                # Generate LocalId automatically
                local_id = i // 3 + 1

                # Build the CSV row
                csv_row = [local_id, 1, type_value, 0, is_sender, i, '',
                           message, date_time, '', '', '']
                csv_writer.writerow(csv_row)

    print(f"Conversion finished; result saved to {output_file}")

# Usage example
# convert_messages("message.txt", "messages.csv")

Plotting Setup

sns.set_theme(style="ticks")
font = "simsun.ttc"
fp = fm.FontProperties(fname=font)
plt.rcParams["axes.unicode_minus"] = False

Person Labels

labels = ["WJY", "XJH"]

Data Loading

  • filePath: path to the converted message file
  • dStart: start time
  • dEnd: end time
  • tZone: time zone offset, e.g. +8 for Beijing time
filePath = "messages.csv"
dStart = "2022-08-07 00:00:00 +0800"
dEnd = "2024-02-10 23:59:59 +0800"
tZone = 8
df = pd.read_csv(filePath, encoding="utf-8")

df.loc[:, "StrTime"] = pd.to_datetime(df["StrTime"])
df.loc[:, "day"] = pd.to_datetime(df["StrTime"]).dt.dayofweek
df.loc[:, "hour"] = pd.to_datetime(df["StrTime"]).dt.hour
df.loc[:, "Count"] = 1

dfs = [df.query("IsSender == 0"), df.query("IsSender == 1")]
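dStart, dEnd, and tZone are defined above, but the filtering step itself is not shown in the original notebook. A minimal sketch of how the DataFrame could be restricted to that window (an assumption, not part of the original code):

# Sketch (assumption): keep only messages inside the [dStart, dEnd] window.
# StrTime is timezone-naive local time, so the +0800 offset is dropped from the bounds.
t0 = pd.to_datetime(dStart).tz_localize(None)
t1 = pd.to_datetime(dEnd).tz_localize(None)
df = df[(df["StrTime"] >= t0) & (df["StrTime"] <= t1)].copy()
dfs = [df.query("IsSender == 0"), df.query("IsSender == 1")]  # rebuild the per-person frames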

Message Filtering

def textFilter(text: str):
    text = text.lower()
    # try:
    #     co = re.compile("[\U00010000-\U0010ffff]")
    # except re.error:
    #     co = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
    # text = co.sub(" ", text)
    co = re.compile(r"\[[\u4e00-\u9fa5]+\]")  # bracketed emotes such as [图片], [表情]
    return co.sub(" ", text)
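A quick illustration of what the filter does (example string made up):

# Bracketed emotes are blanked out and text is lower-cased
print(textFilter("好呀[图片]OK[表情]"))   # -> "好呀 ok "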

Extracting Text Messages

texts = [
    [textFilter(i) for i in dfs[0].query("Type == 1")["StrContent"].to_list()],
    [textFilter(i) for i in dfs[1].query("Type == 1")["StrContent"].to_list()],
]

Message Frequency Analysis

Type Analysis

Grouping messages by type shows which kinds of messages each person tends to send, and who sends more overall. (Note: the TXT export converted above only produces Text, Image, and Sticker entries, so the Voice and Video counts stay at zero here.)

  • 1 = Text
  • 3 = Image
  • 34 = Voice
  • 43 = Video
  • 47 = Sticker
  • 48 = Location
  • 10000 = System
data = {}
for i in range(2):
    data[labels[i]] = [
        len(dfs[i].query("Type == 1")),
        len(dfs[i].query("Type == 3")),
        len(dfs[i].query("Type == 34")),
        len(dfs[i].query("Type == 43")),
        len(dfs[i].query("Type == 47")),
    ]

data = (
    pd.DataFrame(data, index=["Text", "Image", "Voice", "Video", "Sticker"])
    .reset_index()
    .melt("index")
    .rename(columns={"index": "Type", "variable": "Person", "value": "Count"})
)
g = sns.catplot(data, kind="bar", x="Type", y="Count", hue="Person", palette="dark", alpha=0.6, height=6)

for ax in g.axes.ravel():
    for i in range(2):
        ax.bar_label(ax.containers[i], fontsize=9)
sns.move_legend(g, "upper right")
plt.yscale("log")

g.figure.set_size_inches(6, 5)
g.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: message counts by type]

Message Length Analysis

  • sN: sets the display range to
    $$
    \mu + \mathrm{sN} \cdot \sigma
    $$
  • multiple: histogram stacking mode ("layer", "dodge", "stack", or "fill")
sN = 3
multiple = "dodge"
mu, std = 0, 0
data = {"Length": [], "Person": []}
for i in range(2):
    length = [len(textFilter(t)) for t in texts[i]]
    data["Length"] += length
    data["Person"] += [labels[i]] * len(length)
    if np.mean(length) + sN * np.std(length) > mu + std:
        mu, std = np.mean(length), np.std(length)
xlim = int(np.ceil(mu + sN * std))

data = pd.DataFrame(data)
bins = np.linspace(0, xlim, xlim + 1)

ax = sns.histplot(
    data=data,
    x="Length",
    hue="Person",
    bins=bins,
    multiple=multiple,
    edgecolor=".3",
    linewidth=0.5,
    palette="dark",
    alpha=0.6,
)
ax.set_xlim(0, xlim)
ax.set_xlabel("Length of Message")

ax.figure.set_size_inches(8, 4)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: message length distribution]

Daily Activity Analysis

Counting the messages sent in each hour of the day shows which times of day are most active.

data = {"Time": [], "Person": []}
for i in range(2):
hour = dfs[i]["hour"].to_list()
data["Time"] += hour
data["Person"] += [labels[i]] * len(hour)

data = pd.DataFrame(data)
bins = np.arange(0, 25, 1)

ax = sns.histplot(
data=data,
x="Time",
hue="Person",
bins=bins,
multiple=multiple,
edgecolor=".3",
linewidth=0.5,
palette="dark",
alpha=0.6,
)
ax.set_xticks(bins)
ax.set_xticklabels(bins)
ax.set_xlabel("Hour")
ax.set_xlim(0, 24)
sns.move_legend(ax, loc="upper center", bbox_to_anchor=(0.5, 1.2), ncol=2)

ax.figure.set_size_inches(8, 4)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: messages per hour of day]

Weekly Activity Analysis

Message counts for each day of the week, Monday through Sunday.

grouper = pd.Grouper(key="day")
data = df.groupby(grouper)["Count"].sum()
data = data.sort_index()
data.index = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

ax = sns.barplot(data=data, errorbar=None)
ax.set_xlabel("Weekday")
ax.bar_label(ax.containers[0], fontsize=10)

ax.figure.set_size_inches(5, 5)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: messages per weekday]

Year-Round Activity by Week

Counting messages in 7-day bins shows how activity varies from week to week across the whole period.

  • wTicks: spacing between colorbar ticks
  • wStart: first Monday of the label range (a Monday on or before the first message)
  • wEnd: end of the label range (on or shortly after the last message); pd.date_range(wStart, wEnd, freq="W-MON") must yield exactly one Monday label per weekly bin
wTicks = 500
wStart = "2022-08-01"
wEnd = "2024-02-09"
grouper = pd.Grouper(key="StrTime", freq="W-MON")
data = df.groupby(grouper)["Count"].sum().to_frame()
data.index = pd.date_range(start=wStart, end=wEnd, freq="W-MON").strftime("%m-%d")
data.columns = ["Count"]

vM = np.ceil(data["Count"].max() / wTicks) * wTicks
norm = plt.Normalize(0, vM)
sm = plt.cm.ScalarMappable(cmap="Reds", norm=norm)

ax = sns.barplot(x=data.index, y=data["Count"], hue=data["Count"], hue_norm=norm, palette="Reds")
ax.set_xlabel("Date")
plt.xticks(rotation=60)
for bar in ax.containers:
    ax.bar_label(bar, fontsize=10, fmt="%.0f")
ax.get_legend().remove()

axpos = ax.get_position()
caxpos = mtransforms.Bbox.from_extents(axpos.x1 + 0.02, axpos.y0, axpos.x1 + 0.03, axpos.y1)
cax = ax.figure.add_axes(caxpos)

locator = mticker.MultipleLocator(wTicks)
formatter = mticker.StrMethodFormatter("{x:.0f}")
cax.figure.colorbar(sm, cax=cax, ticks=locator, format=formatter)

ax.figure.set_size_inches(20, 8)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: weekly message counts]

Weekly Chat Enthusiasm Analysis

For each 7-day bin, compute a chat enthusiasm index: the number of messages sent minus the number received, divided by the total number of messages:
$$
E = \frac{Q_\mathrm{S} - Q_\mathrm{R}}{Q_\mathrm{S} + Q_\mathrm{R}}
$$
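As a quick sanity check of the formula with made-up numbers:

q_s, q_r = 60, 40                 # messages sent vs. received in one week (toy values)
print((q_s - q_r) / (q_s + q_r))  # 0.2 -> positive values mean more sent than received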

grouper = pd.Grouper(key="StrTime", freq="W-MON")
df_W1 = dfs[0].groupby(grouper)["Count"].sum()
df_W2 = dfs[1].groupby(grouper)["Count"].sum()

data = pd.DataFrame({"E": (df_W1 - df_W2) / (df_W1 + df_W2)})
data.index = pd.date_range(start=wStart, end=wEnd, freq="W-MON").strftime("%m-%d")

vM = data["E"].abs().max()
vm = data["E"].min()
norm = plt.Normalize(-vM, vM)
sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)
# Weeks with no messages produce NaN; fill them with 0 before plotting
data["E"] = data["E"].fillna(0)

ax = sns.barplot(x=data.index, y=data["E"], hue=data["E"], hue_norm=norm, palette="coolwarm")
ax.set_xlabel("Date")
plt.xticks(rotation=60)
ax.set_ylabel("Enthusiasm Index")
for bar in ax.containers:
    ax.bar_label(bar, fontsize=10, fmt="%.2f")
ax.get_legend().remove()

axpos = ax.get_position()
caxpos = mtransforms.Bbox.from_extents(axpos.x1 + 0.02, axpos.y0, axpos.x1 + 0.03, axpos.y1)
cax = ax.figure.add_axes(caxpos)

locator = mticker.MultipleLocator(0.1)
formatter = mticker.StrMethodFormatter("{x:.2f}")
cax.figure.colorbar(sm, cax=cax, ticks=locator, format=formatter)

ax.figure.set_size_inches(20, 8)
ax.figure.set_dpi(150)
plt.show()
plt.close()


[Figure: weekly enthusiasm index]

Year-Round Activity by Day

The same activity shown as a heatmap of daily message counts.

grouper = pd.Grouper(key="StrTime", freq="D")
data = df.groupby(grouper)["Count"].sum()
data = data.to_frame()

wStart = "2023-01-01"
wEnd = "2024-01-01"

data["date"] = data.index
data["week"] = data["date"].dt.isocalendar()["week"]
data["day"] = data["date"].dt.dayofweek
data.index = range(len(data))
for i in range(7):
    if data.loc[i, "week"] > 1:
        data.loc[i, "week"] = 0

print(data)
data = data.pivot_table(index="day", columns="week", values="Count")
data.index = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
data.columns = pd.date_range(start=wStart, end=wEnd, freq="W-MON").strftime("%m-%d")

ax = sns.heatmap(
data,
annot=False,
linewidths=0.5,
cbar_kws={"orientation": "vertical", "location": "left", "pad": 0.03},
cmap="Reds",
)
ax.set_xlabel("Week")
ax.set_ylabel("Weekday")
ax.figure.set_size_inches(24, 4)
ax.figure.set_dpi(150)
plt.show()
plt.close()


     Count       date  week  day
0       46 2022-08-07     0    6
1      104 2022-08-08     0    0
2      100 2022-08-09     0    1
3       47 2022-08-10     0    2
4        3 2022-08-11     0    3
..     ...        ...   ...  ...
547     76 2024-02-05     6    0
548      0 2024-02-06     6    1
549      0 2024-02-07     6    2
550     47 2024-02-08     6    3
551     24 2024-02-09     6    4

[552 rows x 4 columns]

[Figure: daily activity heatmap]

Word Analysis

Segmentation Dictionaries, Stopwords, and Excluded Parts of Speech

jieba.load_userdict("thuocl.txt")    # THUOCL domain dictionary
jieba.load_userdict("userdict.txt")  # custom user dictionary
stopwords = [line.strip() for line in open("stopwords.txt", "r", encoding="utf-8").readlines()] + [" ", "\n", "\r\n"]
wordclass = ["v", "u", "vd", "r", "p", "w"]  # POS tags to drop: verbs, auxiliaries, adverb-verbs, pronouns, prepositions, punctuation

Segmentation Function

def wordSplit(texts, wordclass):
    words = []
    pbar = tqdm(total=len(texts))
    for i in range(len(texts)):
        res = pseg.lcut(texts[i])
        for pair in res:
            if pair.word in stopwords:
                continue
            if pair.flag in wordclass:
                continue
            words.append(pair.word)
        if i % 1000 == 0:
            pbar.update(1000)
    pbar.close()
    return words
words = [wordSplit(texts[i], wordclass) for i in range(2)]
15000it [00:08, 1837.45it/s]                           
17000it [00:07, 2395.06it/s]                           

Word Cloud Rendering

  • font: font path; the font must cover Chinese, and ideally emoji as well
  • mask: mask image that determines the shape of the word cloud
  • cmap: color map
font = "simsun.ttc"
mask = np.array(Image.open("mask.png"))
masks = [np.array(Image.open("mask_L.jpg")), np.array(Image.open("mask_F.jpg"))]
cmap = ListedColormap(
    [
        "#fac1cf",
        "#a9d7ba",
        "#58b1db",
        "#f296ab",
        "#5dab81",
        "#3d9ec4",
        "#e16a8d",
        "#237b50",
        "#1e8299",
        "#8d3549",
        "#35563b",
        "#2d5d73",
    ]
)
def wordCloud(text, font, mask, cmap):
    wc = WordCloud(
        background_color="white",
        scale=5,
        font_path=font,
        mask=mask,
        colormap=cmap,
        collocations=False,
    ).generate(text)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
wordCloud(" ".join(words[0]), font, masks[0], cmap)

[Figure: word cloud, WJY]

wordCloud(" ".join(words[1]), font, masks[1], cmap)

[Figure: word cloud, XJH]

wordCloud(" ".join(words[0] + words[1]), font, mask, cmap)

[Figure: word cloud, combined]

Top Word Ranking

List the wN most frequent words and show how much each person contributed.

  • wN: number of words to show, 50 by default
wN = 50
data = pd.DataFrame(
    {
        "words": words[0] + words[1],
        "L": [1] * len(words[0]) + [0] * len(words[1]),
        "F": [0] * len(words[0]) + [1] * len(words[1]),
        "S": [1] * len(words[0]) + [1] * len(words[1]),
    }
)

grouper = pd.Grouper(key="words")
data = data.groupby(grouper).sum()
data = data.sort_values(by="S", ascending=False)
data = data.iloc[:wN]
# Convert a few emoji that the chart font cannot render into text labels
tmp = data.index.to_list()
for i in range(wN):
    if tmp[i] == "😘":
        tmp[i] = "[亲亲]"
    elif tmp[i] == "😂":
        tmp[i] = "[笑哭]"
    elif tmp[i] == "🤦":
        tmp[i] = "[捂脸]"
    elif tmp[i] == "😁":
        tmp[i] = "[呲牙]"
data.index = tmp
ratio = data["L"] / data["S"]
norm = plt.Normalize(0, 1)
sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)

fig = plt.figure(figsize=(10, 10), dpi=300)
grid = plt.GridSpec(1, 4, wspace=0.5)

ax0 = fig.add_subplot(grid[0, 0])
sns.barplot(x=-data["L"], y=data.index, ax=ax0, hue=ratio, hue_norm=norm, palette="coolwarm")
ax1 = fig.add_subplot(grid[0, 1:])
sns.barplot(x=data["F"], y=data.index, ax=ax1, hue=(1 - ratio), hue_norm=norm, palette="coolwarm")

ax0.set_xlabel("Word Frequency")
ax0.set_ylabel("")
ax0.set_xticks(range(-400, 1, 200))
ax0.set_xticklabels([400, 200, 0])
ax0.set_xlim(-400, 0)
ax0.set_yticks([])
ax0.spines["left"].set_visible(False)
ax0.spines["top"].set_visible(False)
ax0.spines["right"].set_visible(False)
ax0.set_title("WJY")
ax0.get_legend().remove()

ax1.set_xlabel("Word Frequency")
ax1.set_ylabel("")
ax1.set_xticks(range(0, 1201, 200))
ax1.set_xticklabels([0, 200, 400, 600, 800, 1000, 1200])
ax1.set_xlim(0, 1200)
ax1.set_yticks([])
ax1.spines["left"].set_visible(False)
ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)
ax1.set_title("XJH")
ax1.get_legend().remove()

axpos = ax1.get_position()
caxpos = mtransforms.Bbox.from_extents(axpos.x0 + 0.06, axpos.y0 + 0.03, axpos.x1, axpos.y0 + 0.04)
cax = ax1.figure.add_axes(caxpos)

locator = mticker.MultipleLocator(0.1)
formatter = mticker.StrMethodFormatter("{x:.1f}")
cax.figure.colorbar(sm, cax=cax, orientation="horizontal", ticks=locator, format=formatter)
# cax.tick_params(top=True, labeltop=True, bottom=False, labelbottom=False)
cax.set_title("ratio")

x0 = ax0.get_position().x1
x1 = ax1.get_position().x0
xm = (x0 + x1) / 2
y0 = ax0.get_position().y0
y1 = ax0.get_position().y1

for i in range(wN):
    fig.text(
        xm, y0 + (y1 - y0) * (wN - i - 0.5) / wN, data.index[i],
        color="black", ha="center", va="center", fontproperties=fp
    )

fig.set_dpi(150)
plt.show()
plt.close()

[Figure: top-50 word frequencies by person]

Sentiment Analysis

Sentiment analysis with paddlenlp yields a score in [-1, 1]; values near -1 are negative, values near +1 are positive. (The Taskflow returns a label and the probability of that label; negative-label probabilities are flipped to 1 - p so the value always measures positivity, and 2p - 1 then maps it onto [-1, 1].)

dfE = df.query("Type == 1")[["IsSender", "StrContent", "StrTime", "hour"]]
dfE.index = range(len(dfE))
senta = Taskflow("sentiment_analysis", home_path="./")
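For reference, each element returned by the Taskflow is a dict with the input text, the predicted label, and its probability; the cells below turn that into a positivity score. A tiny illustration with a made-up result:

# Illustrative only: one made-up Taskflow result and the mapping used below
example = {"text": "今天超开心", "label": "positive", "score": 0.95}
p = example["score"] if example["label"] == "positive" else 1 - example["score"]
print(2 * p - 1)   # ≈ 0.9 -> strongly positive on the [-1, 1] scale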
scores = pd.DataFrame(senta([textFilter(i) for i in dfE["StrContent"].to_list()]))
scores.loc[scores["label"] == "negative", "score"] = 1 - scores.loc[scores["label"] == "negative", "score"]
dfE["score"] = scores["score"]
dfE["score"] = 2 * dfE["score"] - 1
dfE["Person"] = dfE.apply(lambda x: labels[x["IsSender"]], axis=1)

dfEs = [dfE.query("IsSender == 0"), dfE.query("IsSender == 1")]

Overall Sentiment Distribution

ax = sns.histplot(data=dfE, x="score", hue="Person", palette="dark", alpha=0.6, bins=100)

ax.set_xlabel("Sentiment Score")
ax.set_ylabel("Count")
ax.set_title("Sentiment Distribution")
ax.set_xlim(-1, 1)

ax.figure.set_size_inches(8, 3)
ax.figure.set_dpi(150)
plt.show()

[Figure: sentiment score distribution]

Weekly Average Sentiment Score

def weekAvgSenScore(df):
    wStart = "2022-08-01"
    wEnd = "2024-02-10"

    grouper = pd.Grouper(key="StrTime", freq="W-MON")
    data = df.groupby(grouper)["score"].mean().to_frame()
    data.index = pd.date_range(start=wStart, end=wEnd, freq="W-MON").strftime("%m-%d")
    data.columns = ["score"]

    vM = data["score"].abs().max()
    norm = plt.Normalize(-vM, vM)
    sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)
    # Weeks with no messages produce NaN; fill them with 0 before plotting
    data["score"] = data["score"].fillna(0)

    ax = sns.barplot(x=data.index, y=data["score"], hue=data["score"], hue_norm=norm, palette="coolwarm")
    ax.set_xlabel("Date")
    plt.xticks(rotation=60)
    for bar in ax.containers:
        ax.bar_label(bar, fontsize=10, fmt="%.2f")
    ax.get_legend().remove()

    axpos = ax.get_position()
    caxpos = mtransforms.Bbox.from_extents(axpos.x1 + 0.02, axpos.y0, axpos.x1 + 0.03, axpos.y1)
    cax = ax.figure.add_axes(caxpos)

    locator = mticker.MultipleLocator(0.02)
    formatter = mticker.StrMethodFormatter("{x:.2f}")
    cax.figure.colorbar(sm, cax=cax, ticks=locator, format=formatter)

    ax.figure.set_size_inches(20, 8)
    ax.figure.set_dpi(150)
    plt.show()
    plt.close()

    return data["score"]
avgSenScore0 = weekAvgSenScore(dfEs[0])

[Figure: weekly average sentiment, WJY]

avgSenScore1 = weekAvgSenScore(dfEs[1])

[Figure: weekly average sentiment, XJH]

_ = weekAvgSenScore(dfE)

[Figure: weekly average sentiment, overall]

ax = sns.lineplot(data=avgSenScore0, linewidth=3, marker="s", markersize=15, label=labels[0])
ax = sns.lineplot(data=avgSenScore1, linewidth=3, marker="^", markersize=15, ax=ax, label=labels[1])

ax.set_xlabel("Date")
plt.xticks(rotation=60)
ax.set_ylabel("Average Sentiment Score")
ax.set_xlim(0, 52)
ax.legend(prop={"size": 24})

ax.figure.set_size_inches(20, 8)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: weekly average sentiment, line comparison]

Weekly Cumulative Sentiment Score

def weekTotSenScore(df):
    wStart = "2022-08-01"
    wEnd = "2024-02-10"
    grouper = pd.Grouper(key="StrTime", freq="W-MON")
    data = df.groupby(grouper)["score"].sum().to_frame()
    data.index = pd.date_range(start=wStart, end=wEnd, freq="W-MON").strftime("%m-%d")
    data.columns = ["score"]

    vM = data["score"].abs().max()
    norm = plt.Normalize(-vM, vM)
    sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)

    ax = sns.barplot(x=data.index, y=data["score"], hue=data["score"], hue_norm=norm, palette="coolwarm")
    ax.set_xlabel("Date")
    plt.xticks(rotation=60)
    for bar in ax.containers:
        ax.bar_label(bar, fontsize=10, fmt="%.2f")
    ax.get_legend().remove()

    axpos = ax.get_position()
    caxpos = mtransforms.Bbox.from_extents(axpos.x1 + 0.02, axpos.y0, axpos.x1 + 0.03, axpos.y1)
    cax = ax.figure.add_axes(caxpos)

    locator = mticker.MultipleLocator(20)
    formatter = mticker.StrMethodFormatter("{x:.2f}")
    cax.figure.colorbar(sm, cax=cax, ticks=locator, format=formatter)

    ax.figure.set_size_inches(20, 8)
    ax.figure.set_dpi(150)
    plt.show()
    plt.close()

    return data["score"]
totSenScore0 = weekTotSenScore(dfEs[0])

[Figure: weekly cumulative sentiment, WJY]

totSenScore1 = weekTotSenScore(dfEs[1])

[Figure: weekly cumulative sentiment, XJH]

_ = weekTotSenScore(dfE)

[Figure: weekly cumulative sentiment, overall]

ax = sns.lineplot(data=totSenScore0, linewidth=3, marker="s", markersize=15, label=labels[0])
ax = sns.lineplot(data=totSenScore1, linewidth=3, marker="^", markersize=15, ax=ax, label=labels[1])

ax.set_xlabel("Date")
plt.xticks(rotation=60)
ax.set_ylabel("Total Sentiment Score")
ax.set_xlim(0, 52)
ax.legend(prop={"size": 24})

ax.figure.set_size_inches(20, 8)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: weekly cumulative sentiment, line comparison]

Average Sentiment by Hour of Day

grouper = pd.Grouper(key="hour")

data = []
for k in range(2):
    tmp = dfEs[k].groupby(grouper)["score"].mean().sort_index()
    for i in range(24):
        if i in tmp.index:
            data.append(tmp[i])
        else:
            data.append(0)
    data.append(0)
data = pd.DataFrame(
    {
        "Score": data,
        "Person": [labels[0]] * 25 + [labels[1]] * 25,
    }
)

xBins = [i for i in range(25)]
ax = sns.histplot(
    data=data,
    x=xBins * 2,
    bins=xBins,
    weights="Score",
    hue="Person",
    multiple=multiple,
    edgecolor=".3",
    linewidth=0.5,
    palette="dark",
    alpha=0.6,
)

ax.set_xticks(range(25))
ax.set_xticklabels(range(25))
ax.set_xlabel("Hour")
ax.set_xlim(0, 24)
ax.set_ylim(np.min([0, np.floor(data["Score"].min() / 0.05) * 0.05]), np.ceil(data["Score"].max() / 0.05) * 0.05)
sns.move_legend(ax, loc="upper center", bbox_to_anchor=(0.5, 1.2), ncol=2)

ax.figure.set_size_inches(8, 4)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: average sentiment by hour]

Cumulative Sentiment by Hour of Day

grouper = pd.Grouper(key="hour")

data = []
for k in range(2):
    tmp = dfEs[k].groupby(grouper)["score"].sum().sort_index()
    for i in range(24):
        if i in tmp.index:
            data.append(tmp[i])
        else:
            data.append(0)
    data.append(0)
data = pd.DataFrame(
    {
        "Score": data,
        "Person": [labels[0]] * 25 + [labels[1]] * 25,
    }
)

xBins = [i for i in range(25)]
ax = sns.histplot(
    data=data,
    x=xBins * 2,
    bins=xBins,
    weights="Score",
    hue="Person",
    multiple=multiple,
    edgecolor=".3",
    linewidth=0.5,
    palette="dark",
    alpha=0.6,
)

ax.set_xticks(range(25))
ax.set_xticklabels(range(25))
ax.set_xlabel("Hour")
ax.set_xlim(0, 24)
ax.set_ylim(np.min([0, np.floor(data["Score"].min() / 0.05) * 0.05]), np.ceil(data["Score"].max() / 0.05) * 0.05)
sns.move_legend(ax, loc="upper center", bbox_to_anchor=(0.5, 1.2), ncol=2)

ax.figure.set_size_inches(8, 4)
ax.figure.set_dpi(150)
plt.show()
plt.close()

[Figure: cumulative sentiment by hour]

Author: 明诚 | Published: 2024-02-11 | Updated: 2024-02-13
